diff --git "a/logs/main_log.txt" "b/logs/main_log.txt" --- "a/logs/main_log.txt" +++ "b/logs/main_log.txt" @@ -1,124944 +1,3 @@ -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
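The launcher banner above was emitted once per launched process; it concerns thread oversubscription: with many ranks per node, defaulting each rank to a single OpenMP thread keeps the CPU from being overloaded. A minimal sketch of tuning it per process, assuming the variable is set before any thread pools initialize (the value 4 is an illustrative assumption, not a recommendation from the log):

import os

# Must happen before torch (or numpy) initializes its thread pools, so keep
# it at the very top of the entry script. Size it so that
#   ranks_per_node * OMP_NUM_THREADS <= physical cores.
os.environ.setdefault("OMP_NUM_THREADS", "4")

import torch

# Keep torch's intra-op thread pool consistent with the OpenMP setting.
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))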
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
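Every op marked [NO] in the report above is compiled just-in-time on first use, and as the report says, that JIT path requires ninja. A small preflight check one could run before launching, using torch's public cpp_extension helper (a sketch, not part of the logged run):

import torch.utils.cpp_extension as cpp_ext

# DeepSpeed JIT-builds uninstalled ops through torch.utils.cpp_extension,
# which shells out to the ninja build tool; fail fast if it is absent.
if not cpp_ext.is_ninja_available():
    raise RuntimeError("ninja is required to JIT compile DeepSpeed ops")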
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
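The async_io lines above mean the op stays unavailable ([NO]) until libaio's shared object and dev headers are present, either from the libaio-devel package or via CFLAGS/LDFLAGS pointing at a source build. One way to check that at least the runtime .so is visible to the loader (a sketch; it does not verify the dev headers the warning also asks for):

import ctypes.util

# find_library scans the standard loader paths, so None here predicts the
# [NO] status above; a hit looks like "libaio.so.1".
libaio = ctypes.util.find_library("aio")
if libaio is None:
    print("libaio not found; install libaio-devel or set CFLAGS/LDFLAGS")
else:
    print("found", libaio)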
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
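The environment block above (torch 1.8.1 built for CUDA 11.1, nvcc 11.2, DeepSpeed 0.5.5 at commit cd7967d on master) is the same summary DeepSpeed's ds_report tool prints. The key fields can also be read programmatically; a minimal sketch, assuming only the standard version attributes:

import torch
import deepspeed

# Mirrors the "general environment info" fields from the log.
print("torch install path ...", torch.__path__)
print("torch version ........", torch.__version__)
print("torch cuda version ...", torch.version.cuda)
print("deepspeed info .......", deepspeed.__version__)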
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op nameop name - op name ................op name................ installed................installed................ ....installed installed compatible compatible -.... 
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
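The installed/compatible table can be regenerated outside the launcher by iterating the per-op builders; a rough sketch, assuming the builder class names match the op names shown above and that each builder exposes name and is_compatible():

    # Sketch: rebuild the op-name table above by querying each builder.
    # Assumes the same deepspeed.ops.op_builder layout as the earlier sketches.
    from deepspeed.ops.op_builder import (
        CPUAdamBuilder, FusedAdamBuilder, FusedLambBuilder,
        SparseAttnBuilder, TransformerBuilder, StochasticTransformerBuilder,
    )

    for builder in (CPUAdamBuilder(), FusedAdamBuilder(), FusedLambBuilder(),
                    SparseAttnBuilder(), TransformerBuilder(),
                    StochasticTransformerBuilder()):
        status = "[OKAY]" if builder.is_compatible() else "[NO]"
        print(f"{builder.name:.<24} {status}")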
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
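The `type: git: not found` lines show the compute nodes have no git binary, so Megatron's startup banner degrades to git_hash=unknown git_branch=unknown instead of failing. A small sketch of that fallback pattern, stdlib only (the helper name is hypothetical, not Megatron's actual function):

    # Sketch: emit a Megatron-style git banner that degrades gracefully
    # when git is missing, as on these compute nodes.
    import shutil
    import subprocess

    def _git(*args):
        if shutil.which("git") is None:
            return "unknown"
        out = subprocess.run(["git", *args], capture_output=True, text=True)
        return out.stdout.strip() or "unknown"

    git_hash = _git("rev-parse", "--short", "HEAD")
    git_branch = _git("rev-parse", "--abbrev-ref", "HEAD")
    print(f"**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****")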
1.8.1['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch cuda version torch version............... ....................11.1 -1.8.1nvcc version - ..................... torch cuda version11.2 -...............deepspeed install path 11.1........... - nvcc version ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']..................... - deepspeed info11.2 -...................deepspeed install path 0.5.5+cd7967d, cd7967d, master........... - deepspeed wheel compiled w. ......['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -torch 1.8, cuda 11.1deepspeed info - ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja -JIT compiled ops requires ninja - -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY] -[OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- ---------------------------------------------------op name - - op name................ op name................op name installed................installed................ ....installed installed compatible.. compatible .. -compatible - ---------------------------------------------------------------------------------------------------- - -compatible --------------------------------------------------- - --------------------------------------------------- -cpu_adam ...............cpu_adam cpu_adam [YES]............... cpu_adam ...............[YES] .....................[YES] ......[YES][OKAY]...... -[OKAY] ...... -[OKAY] -[OKAY] -fused_adamfused_adam ..........................fused_adam [NO]fused_adam [NO] .......................... ....... .......[NO] [NO] [OKAY] [OKAY] -....... -....... [OKAY][OKAY]fused_lambfused_lamb - - ............. fused_lamb.............[NO] [NO].......fused_lamb............. [OKAY].......[NO]............. - [OKAY][NO] -....... [OKAY] - ....... [OKAY] -sparse_attn ............sparse_attn sparse_attn[NO]............ ...................[NO] sparse_attn[OKAY].......[NO] - [OKAY]....... - transformer............[OKAY]transformer ............ -[NO] ............[NO] transformer .......................... [NO] [OKAY] - [NO].......[OKAY] -.......[OKAY]stochastic_transformer transformer - [OKAY] -............stochastic_transformer. [NO]stochastic_transformer[NO] ................ [OKAY][NO] [OKAY]....... - -[NO] [OKAY]....... - [OKAY]stochastic_transformer - . [NO] ....... [OKAY] -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** ------------------------------------------------------------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- --------------------------------------------------- - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ---------------------------------------------------JIT compiled ops requires ninja---------------------------------------------------------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaJIT compiled ops requires ninja - - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op nameop nameop nameop name ................................................................ installedinstalledinstalledinstalled .. .. .... compatiblecompatible - -compatiblecompatible-------------------------------------------------- --------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- - -cpu_adam cpu_adam............... cpu_adam[YES] cpu_adam ............... ............... ..................... [YES][YES][YES][OKAY] .................. - [OKAY][OKAY][OKAY] - - -fused_adam ............. [NO] fused_adamfused_adam .......fused_adam .......................... [OKAY]............. [NO] -[NO] [NO] ....... .......fused_lamb....... [OKAY][OKAY]............. -[OKAY] - -[NO] ....... fused_lamb[OKAY]fused_lamb -fused_lamb ....................................... [NO][NO][NO] ..................... [OKAY]sparse_attn[OKAY][OKAY] - -............ - [NO] ....... [OKAY] -transformer ............ [NO]sparse_attn sparse_attn....... sparse_attn ........................ [OKAY] -............[NO] [NO] [NO] .......stochastic_transformer ....... .......[OKAY] . -[OKAY] [OKAY] -[NO] -transformer transformer ....... transformer............ ............ [OKAY] [NO]............ - [NO][NO]....... ....... .......[OKAY][OKAY] - -[OKAY] -stochastic_transformerstochastic_transformer stochastic_transformer ... [NO][NO] .......[NO]....... [OKAY].......[OKAY] - -[OKAY] -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. 
-async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- --------------------------------------------------- - -JIT compiled ops requires ninja ---------------------------------------------------DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report - - -JIT compiled ops requires ninja---------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY] -[OKAY] - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- ---------------------------------------------------op name -................op name - op name................installedop name .. installed................ ................ compatible ..installed -installed compatible-------------------------------------------------- - .. -.. compatiblecompatible --------------------------------------------------- --------------------------------------------------- - --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY]cpu_adamcpu_adam - .............................. cpu_adam [YES][YES] ............ fused_adam[OKAY] -............................[OKAY] -[NO] ....... [OKAY] -[YES] ......fused_lambfused_adam [OKAY] .............fused_adam [NO]............. -.................... [NO][NO][OKAY] -.............. [OKAY][OKAY] - -fused_adamfused_lambfused_lamb ............. [NO] sparse_attn....... ............ [OKAY][NO] ............. - .......[NO] [OKAY].................... - [OKAY]transformersparse_attn ............ -............ [NO] [NO][NO] .............. [OKAY] -.......sparse_attn [OKAY][OKAY]............stochastic_transformer - -. transformer [NO] [NO] .......................... fused_lamb[NO][OKAY][OKAY] - -....... 
transformer[OKAY] -............ [NO]............. stochastic_transformer ....... [NO] .[OKAY]....... [NO] - ....... [OKAY][OKAY] - -stochastic_transformer . [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja---------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -DeepSpeed C++/CUDA extension op report --------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -ninjaninjaninjaninja .................. .................................... .................. [OKAY] [OKAY][OKAY] - -[OKAY] - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - -op name -op name op name op name ................................ ................ ................ installedinstalled installed installed .... .. compatiblecompatible..compatible - - - ----------------------------------------------------------------------------------------------------compatible-------------------------------------------------- - - - --------------------------------------------------- -cpu_adamcpu_adam cpu_adam.............................. ...............cpu_adam[YES][YES] [YES] ..................... ...... ...... [OKAY][YES] [OKAY] - -......[OKAY] -[OKAY] -fused_adam ............. fused_adam[NO] .................... fused_adam [NO] fused_adam[OKAY] ............. -.................... [NO][NO][OKAY] ....... -fused_lamb....... [OKAY]fused_lamb[OKAY]............. - - .............[NO] [NO]....... .......fused_lamb[OKAY]fused_lamb [OKAY] -.......................... - [NO][NO] ....... 
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
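This environment block is DeepSpeed's report of the toolchain it was built against versus what it found at runtime; here the wheel (torch 1.8, cuda 11.1) matches the installed torch 1.8.1 / CUDA 11.1, with nvcc 11.2 on the PATH. A minimal way to regenerate it outside the job, assuming the deepspeed.env_report module shipped with this 0.5.5 build (the same output is normally exposed as the ds_report console script):

    # Sketch only: regenerate the environment block above on demand.
    import deepspeed.env_report as env_report

    env_report.main()  # prints the op table plus torch/cuda/nvcc/deepspeed info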
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
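The two libaio warnings are emitted while DeepSpeed probes whether the async_io op can ever be built: the op needs the libaio headers and shared object at compile time, and the probe fails on these nodes. A hedged reproduction of just that probe (AsyncIOBuilder is assumed from deepspeed.ops.op_builder in this version; the include/lib paths below are placeholders, not taken from this log):

    # Sketch only: rerun the async_io probe behind the warnings above.
    import os

    # Per the warning, a source-built libaio can be exposed via CFLAGS/LDFLAGS;
    # these paths are hypothetical placeholders.
    os.environ.setdefault("CFLAGS", "-I/path/to/libaio/include")
    os.environ.setdefault("LDFLAGS", "-L/path/to/libaio/lib")

    from deepspeed.ops.op_builder import AsyncIOBuilder

    print(AsyncIOBuilder().is_compatible())  # False here: libaio not found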
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
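The "type: git: not found" line shows Megatron shelling out to locate git before asking for the current commit; with no git binary on the compute node it falls back to the unknown/unknown banner. A minimal sketch of that fallback pattern (an illustration, not Megatron's actual implementation):

    # Sketch only: the kind of fallback that yields git_hash=unknown above.
    import subprocess

    def _git(*args: str) -> str:
        try:
            return subprocess.run(
                ["git", *args], capture_output=True, text=True, check=True
            ).stdout.strip()
        except (FileNotFoundError, subprocess.CalledProcessError):
            return "unknown"  # no git on the node, as in this log

    git_hash = _git("rev-parse", "--short", "HEAD")
    git_branch = _git("rev-parse", "--abbrev-ref", "HEAD")
    print(f"**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****")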
...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io async_io............... [NO]............... .......[NO] [NO]....... [NO] - -transformer_inference ..transformer_inference [NO] ....... 
..[OKAY] -[NO] ....... [OKAY]utils - .................. [YES] ...... [OKAY]utils - .................. [YES] ......quantizer [OKAY].............. - [NO] ....... [OKAY] -quantizer .............. [NO]-------------------------------------------------- -....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - ............... [NO] ....... [NO] -async_io ............... [NO] transformer_inference....... ..[NO] -[NO] ....... [OKAY] -utils transformer_inference.................. ..[YES] [NO]...... .......[OKAY] -[OKAY] -quantizer .............. [NO] utils....... ..................[OKAY] -[YES] ...... [OKAY] --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO]async_io - ............... [NO] ....... [NO]transformer_inference - .. [NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
-utils .................. [YES] transformer_inference...... ..[OKAY] -[NO] .......quantizer ..............[OKAY] -[NO] ....... [OKAY] -utils .................. --------------------------------------------------[YES] - ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... [NO] utils....... ..................[NO] -[YES] ...... [OKAY] -quantizer .............. [NO] .......transformer_inference [OKAY].. - [NO] ....... --------------------------------------------------[OKAY] - -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path - ............... torch install path['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -............... torch version .................... 1.8.1 -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch cuda version ...............torch version 11.1.................... - 1.8.1nvcc version - ..................... 11.2torch cuda version - ...............deepspeed install path 11.1........... 
- nvcc version['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -..................... deepspeed info11.2 -................... deepspeed install path0.5.5+cd7967d, cd7967d, master -........... deepspeed wheel compiled w. ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']...... - torch 1.8, cuda 11.1deepspeed info - ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
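This op table is what DeepSpeed's ds_report utility prints at startup. A single op can also be probed programmatically; a sketch assuming the 0.5.x op_builder API (the exact import path is an assumption; is_compatible() is the check the report is built from):

    # Sketch (assumed API): ask a DeepSpeed op builder whether the
    # host can JIT-compile the op -- e.g. async_io, which is [NO]
    # above because the libaio dev headers are missing.
    from deepspeed.ops.op_builder import AsyncIOBuilder

    print("async_io compatible:", AsyncIOBuilder().is_compatible())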
[NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninjaJIT compiled ops requires ninja - - -ninjaninjaninjaninja ...................................................... .................. [OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op name op nameop nameop name................ ................................................installed installed installed.. installed compatible .. -.. .. compatible --------------------------------------------------compatiblecompatible - - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - -cpu_adam ...............cpu_adam cpu_adam[YES]cpu_adam .............................. ............... ......[YES] [YES] [YES] [OKAY] ............ -...... [OKAY][OKAY][OKAY] - - -fused_adam ............. [NO] ....... fused_adamfused_adamfused_adam [OKAY] -....................................... fused_lamb [NO][NO][NO] ........................... [NO]....... [OKAY]....... - [OKAY][OKAY][OKAY] - -fused_lamb - ............. 
[NO]fused_lamb fused_lamb.................... .............[NO][OKAY] sparse_attn -.......[NO] ............[OKAY]....... - [OKAY][NO] - ....... [OKAY] -sparse_attntransformer ............ ............[NO]sparse_attn sparse_attn .......[NO] ............ ............[OKAY] - .......[NO][NO] transformer .......[OKAY] ....... -............ [OKAY] [OKAY][NO] - -stochastic_transformer .......transformertransformer . [OKAY] ........................ - [NO] [NO] [NO] stochastic_transformer....... ....... ........[OKAY] [OKAY] - -[NO][OKAY] -.......stochastic_transformer [OKAY] -.stochastic_transformer [NO] ........ [OKAY] -[NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w. deepspeed wheel compiled w....... ......torch 1.8, cuda 11.1 -torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... 
[OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -JIT compiled ops requires ninja-------------------------------------------------- - ----------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninjaJIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - ----------------------------------------------------------------------------------------------------- -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninjaninjaninjaninja .................. .................. ..................[OKAY] .................. - [OKAY][OKAY][OKAY]-------------------------------------------------- - - - ---------------------------------------------------op name --------------------------------------------------- -------------------------------------------------- - -................op name op nameop name installed .................................................. installed installedinstalledcompatible .. -.. ..--------------------------------------------------compatible -compatible - ---------------------------------------------------compatible-------------------------------------------------- - - ---------------------------------------------------cpu_adam - ............... [YES] ......cpu_adam cpu_adam[OKAY] -..............................cpu_adam [YES][YES]............... ............[YES] [OKAY]fused_adam [OKAY] - ...... -............. [OKAY][NO] - ....... [OKAY] -fused_adam .............fused_lambfused_adam [NO]............. fused_adam .................... [NO] ............. [NO] [OKAY]....... -[NO]....... .......[OKAY]fused_lamb[OKAY] - -.............[OKAY] -fused_lamb[NO] fused_lamb.................... [OKAY].............[NO] - [NO]sparse_attn....... ...................[OKAY] [OKAY] -[NO] - .......sparse_attn [OKAY] -............ [NO]transformer ................... [OKAY][NO] -sparse_attn sparse_attn....... transformer [OKAY]............ -............ ............ [NO][NO][NO] stochastic_transformer ....... .............. .[OKAY][OKAY] -[OKAY] -[NO] - .......transformer [OKAY]transformer stochastic_transformer -............ ............[NO] . [NO] ....... [NO] ....... [OKAY]....... - [OKAY][OKAY] - -stochastic_transformer stochastic_transformer. [NO]. .......[NO] [OKAY]....... - [OKAY] -ninjaninjaninja ninja.................. .................. ..................[OKAY].................. - [OKAY][OKAY][OKAY]-------------------------------------------------- - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- -op name - - op nameop name................op name ................................installed ................ installed installed .. installed.. .. 
compatible..compatible - -compatible -------------------------------------------------- --------------------------------------------------- -compatible-------------------------------------------------- - - --------------------------------------------------- -cpu_adam ............... cpu_adam[YES] cpu_adam............... ...... cpu_adam............... [YES] [OKAY] ...............[YES] -...... [YES]......[OKAY] -......[OKAY] - [OKAY]fused_adam - ............. [NO] ....... [OKAY] -fused_adam .............fused_lamb fused_adam [NO] ..........................fused_adam ....... [NO] [NO]............. [OKAY] -.......[NO]....... [OKAY] fused_lamb[OKAY] - ....... -............. [OKAY][NO] -fused_lamb .................... fused_lamb[NO][OKAY] -....................sparse_attn [OKAY][NO]............ - .......[NO] [OKAY]....... - [OKAY] -sparse_attntransformer ........................ [NO][NO] .......sparse_attn....... ............[OKAY][OKAY] - -sparse_attn[NO] transformer................... stochastic_transformer ............ [OKAY][NO] -. [NO].......[NO] transformer[OKAY]....... - ...................[OKAY] [NO] -transformer[OKAY] ....... -............ stochastic_transformer [OKAY][NO] - ....... .[OKAY] stochastic_transformer -[NO] ........stochastic_transformer [NO] [OKAY]........ - [NO][OKAY] -....... [OKAY] -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report --------------------------------------------------- - - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------JIT compiled ops requires ninja - - - -JIT compiled ops requires ninja--------------------------------------------------JIT compiled ops requires ninja - - -JIT compiled ops requires ninja -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninjaninjaninjaninja .................. ...................................................... [OKAY][OKAY][OKAY][OKAY] - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op nameop nameop nameop name ................................................ ................ installed installed installedinstalled .. .. ..compatible .. - compatiblecompatible-------------------------------------------------- - -compatible ----------------------------------------------------------------------------------------------------- - - --------------------------------------------------- -cpu_adam ............... cpu_adamcpu_adam[YES]cpu_adam ............... ..................... ...............[YES][OKAY] [YES]...... -[YES] ......[OKAY]...... - [OKAY][OKAY] - -fused_adam .............fused_adam fused_adam .............[NO]fused_adam [NO]................................. .......[NO][OKAY][NO] -[OKAY] ....... -....... [OKAY]fused_lamb[OKAY] -fused_lamb -............. .............fused_lamb[NO] fused_lamb [NO]....... .......................... [OKAY] ....... -[NO] [NO] [OKAY] ....... -....... [OKAY][OKAY] - -sparse_attn ............sparse_attn [NO]sparse_attnsparse_attn............ ....... ............ ............[NO][OKAY][NO] - .......[NO]....... transformer[OKAY].......[OKAY] - -............[OKAY]transformer transformer - [NO] transformer........................ [NO][NO]................... .......[NO]....... [OKAY] [OKAY] -[OKAY] -....... - stochastic_transformer[OKAY] -stochastic_transformerstochastic_transformer. [NO]..stochastic_transformer .......[NO] [NO]. [OKAY]....... .......[NO] - [OKAY][OKAY]....... - - [OKAY] -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.1 - nvcc version11.1 -..................... nvcc version11.2 -..................... 
deepspeed install path11.2 -...........deepspeed install path ...........['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']deepspeed info - deepspeed info................... ...................0.5.5+cd7967d, cd7967d, master -0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja --------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
---------------------------------------------------DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - - -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY] -[OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- - ----------------------------------------------------------------------------------------------------- -op name -op name op name ................ op name ................................ installed installed ................ installed .... installed ..compatiblecompatible.. - - --------------------------------------------------compatible ---------------------------------------------------compatible - - --------------------------------------------------- --------------------------------------------------- -cpu_adam cpu_adam............... cpu_adam............... cpu_adam [YES]...............[YES] [YES]..................... ...... ...... [OKAY] [OKAY][YES] -[OKAY] -...... - [OKAY] -ninjaninjaninjaninja ...................................................... .................. [OKAY][OKAY][OKAY] -[OKAY]-------------------------------------------------- - - - -------------------------------------------------------------------------------------------------------------------------------------------------------op name - - -fused_adamfused_adam fused_adam .............fused_adam ............. ............. [NO] .............[NO][NO] .......[NO]....... ....... [OKAY] [OKAY] -[OKAY]....... - -op name................op name op name ................installed ................ ................ installed installed....installed compatible..compatible.. - -------------------------------------------------- - compatible ---------------------------------------------------compatible - - ----------------------------------------------------------------------------------------------------- - - [OKAY] -cpu_adam ............... cpu_adam[YES] cpu_adam...............cpu_adam...... ...............[YES] ............... [OKAY]......[YES] [YES][OKAY]...... - - ......[OKAY] -fused_lambfused_lamb fused_lamb............. fused_lamb.............[NO] .............[NO]............. .......[NO].......[NO] [OKAY].......[OKAY]....... -[OKAY] - -[OKAY] -[OKAY] -fused_adam fused_adam.............fused_adam fused_adam[NO] ............. .................... ............. [OKAY][NO][NO] -sparse_attnsparse_attnsparse_attn sparse_attn ........................ ............[NO] ............ [NO] [NO]....... [NO][OKAY] ....... - [NO]....... .......[OKAY]fused_lamb....... - .............[OKAY] -..............[OKAY] -[OKAY]transformer - [OKAY]transformer -fused_lamb[NO][OKAY] -............ transformer ............ ............[NO]transformer[NO] .......[NO]................... [OKAY].......[OKAY][NO] - -fused_lamb.................... fused_lamb.............[NO] [OKAY]....... [OKAY]............. -[NO] - [NO]....... ....... [OKAY][OKAY] - .......[OKAY] -[OKAY]stochastic_transformer - -stochastic_transformer stochastic_transformer. . 
stochastic_transformer[NO].[NO] ..............[NO] .[OKAY] [OKAY]....... - -sparse_attnsparse_attn ............ ............[NO] .......[NO] sparse_attn[OKAY]....... -[NO] [OKAY]....... - [OKAY] - ............[OKAY]sparse_attn - transformer[NO] ............transformer................... [NO][NO] ................... [OKAY] [NO].......[OKAY] - -....... transformer[OKAY] [OKAY] -............stochastic_transformer -stochastic_transformer transformer [NO]. ....................[NO] [NO] ....... [NO] .......[OKAY] [OKAY] -.......[OKAY] - -stochastic_transformer[OKAY] - . [NO]stochastic_transformer ....... [OKAY]. - [NO] ....... [OKAY] -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninja--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja ninja.................. [OKAY].................. 
- [OKAY]-------------------------------------------------- - ---------------------------------------------------op name - ................op name installed................ ..installed compatible -.. --------------------------------------------------compatible - --------------------------------------------------- -cpu_adam ............... cpu_adam[YES] ..................... [YES][OKAY] -...... [OKAY] -fused_adam ............. [NO]fused_adam .................... [OKAY][NO] - ....... [OKAY]fused_lamb - ............. [NO] fused_lamb....... .............[OKAY] -[NO] ....... [OKAY] -sparse_attn sparse_attn............ ............[NO] [NO]....... .......[OKAY] -[OKAY] -transformer ............transformer [NO]............ .......[NO] [OKAY]....... - [OKAY] -stochastic_transformer stochastic_transformer . [NO]. .......[NO] [OKAY]....... - [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... ninja[OKAY] - .................. [OKAY] --------------------------------------------------- -op name ................ installed ..sparse_attn compatible............ - [NO]-------------------------------------------------- -....... [OKAY] -transformer ............ cpu_adam[NO] ...................... [YES][OKAY] -...... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ------------------------------------------------------------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. ......................................................[OKAY] [OKAY] -[OKAY][OKAY] - --------------------------------------------------- - ------------------------------------------------------------------------------------------------------------------------------------------------------- -op name - - op nameop name................op name ................installed................ ................ .. installedinstalled installed ..compatible compatible.. - -.. -------------------------------------------------- -------------------------------------------------- -compatible -compatible - ----------------------------------------------------------------------------------------------------- - -cpu_adam cpu_adam............... ...............[YES]cpu_adam ......cpu_adam[YES] .............................. [OKAY]...... - [YES][YES][OKAY] -............ [OKAY][OKAY] - -fused_adam ............. fused_adam[NO] .................... [NO]fused_adamfused_adam[OKAY] -.................... ............. [OKAY]fused_lamb [NO] - [NO] ............. fused_lamb.......[NO]....... ....................[OKAY][OKAY] -[NO] -[OKAY] -....... [OKAY] -fused_lambfused_lamb .......................... [NO][NO] .............. [OKAY][OKAY] - -sparse_attn sparse_attn............ ............[NO] [NO]....... .......[OKAY] -[OKAY] -sparse_attntransformertransformer sparse_attn ........................ ............ [NO]............ [NO].......[NO] [OKAY]....... -[NO]....... [OKAY]stochastic_transformer[OKAY]....... - - [OKAY]. -transformer [NO]stochastic_transformertransformer ............ ........ ............[NO] [OKAY] -[NO][NO] ..................... 
[OKAY][OKAY][OKAY] - - -stochastic_transformer stochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - -ninjaninjaninjaninja ...................................................... .................. [OKAY] [OKAY][OKAY] -[OKAY] - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - -op name--------------------------------------------------op name op name - ................ ................op name ................ installed installed..................installed compatible.... installed -compatible -------------------------------------------------- -compatible.. --------------------------------------------------- - -compatible-------------------------------------------------- - --------------------------------------------------- -cpu_adam ...............cpu_adam [YES]...............cpu_adam cpu_adam......[YES] ...............[OKAY]............... ...... - [YES] [YES] [OKAY] ...... -...... [OKAY] -[OKAY] -fused_adam ............. [NO] ....... fused_adam[OKAY]fused_adam -fused_adam ............. ............. ............. [NO]fused_lamb[NO] .................... [NO] .......[OKAY][NO]....... ....... -[OKAY] -[OKAY][OKAY] - -fused_lambfused_lamb ............. .............[NO] fused_lamb [NO] ....... ............. ....... [NO][OKAY]sparse_attn - [OKAY]................... - [NO][OKAY] -....... [OKAY] -transformersparse_attn ........................sparse_attn [NO][NO] ............ sparse_attn.............. ............ [OKAY][NO] [OKAY] - -[NO].......transformer [OKAY]................... -stochastic_transformer [NO][OKAY]transformer . - ...................[NO]transformer [OKAY] [NO]....... -............ .......[OKAY][NO] stochastic_transformer - .......[OKAY] -.[OKAY] -[NO]stochastic_transformer stochastic_transformer ....... .[OKAY]. - [NO][NO] .............. [OKAY][OKAY] - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... 
[NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
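Note on the recurring async_io warning above: the [NO]/[OKAY] tables are DeepSpeed's op-compatibility report, and async_io is marked [NO]/[NO] because the libaio development headers are missing on the compute nodes, so the AIO op cannot be JIT-built. That op is only needed for ZeRO-Infinity NVMe offload, which this run does not use (zero_stage is 1 in the arguments below), so the warning is benign here. A minimal way to reproduce the probe from Python (a sketch, assuming the op_builder API of this DeepSpeed release):

    # Run the same compatibility probe that DeepSpeed's report performs for async_io.
    from deepspeed.ops.op_builder import AsyncIOBuilder
    print(AsyncIOBuilder().is_compatible())  # expected False while libaio headers are absent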
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
-------------------------- arguments ------------------------
-  accumulate_allreduce_grads_in_fp32 .............. False
-  adam_beta1 ...................................... 0.9
-  adam_beta2 ...................................... 0.95
-  adam_eps ........................................ 1e-08
-  adlr_autoresume ................................. False
-  adlr_autoresume_interval ........................ 1000
-  apply_query_key_layer_scaling ................... True
-  apply_residual_connection_post_layernorm ........ False
-  attention_dropout ............................... 0.1
-  attention_softmax_in_fp32 ....................... False
-  bert_binary_head ................................ True
-  bert_load ....................................... None
-  bf16 ............................................ False
-  bias_dropout_fusion ............................. True
-  bias_gelu_fusion ................................ True
-  biencoder_projection_dim ........................ 0
-  biencoder_shared_query_context_model ............ False
-  block_data_path ................................. None
-  checkpoint_activations .......................... True
-  checkpoint_in_cpu ............................... False
-  checkpoint_num_layers ........................... 1
-  clip_grad ....................................... 1.0
-  codecarbon_dir .................................. None
-  consumed_train_samples .......................... 0
-  consumed_train_tokens ........................... 0
-  consumed_valid_samples .......................... 0
-  contigious_checkpointing ........................ False
-  cpu_optimizer ................................... False
-  cpu_torch_adam .................................. False
-  curriculum_learning ............................. False
-  data_impl ....................................... mmap
-  data_parallel_size .............................. 1
-  data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
-  dataloader_type ................................. single
-  DDP_impl ........................................ local
-  decoder_seq_length .............................. None
-  deepscale ....................................... False
-  deepscale_config ................................ None
-  deepspeed ....................................... True
-  deepspeed_activation_checkpointing .............. True
-  deepspeed_config ................................ ./ds_config.1504412.json
-  deepspeed_mpi ................................... False
-  distribute_checkpointed_activations ............. False
-  distributed_backend ............................. nccl
-  embedding_path .................................. None
-  encoder_seq_length .............................. 2048
-  eod_mask_loss ................................... False
-  eval_interval ................................... 1000
-  eval_iters ...................................... 5
-  evidence_data_path .............................. None
-  exit_duration_in_mins ........................... 1190
-  exit_interval ................................... None
-  ffn_hidden_size ................................. 46400
-  finetune ........................................ False
-  fp16 ............................................ True
-  fp16_lm_cross_entropy ........................... False
-  fp32_residual_connection ........................ False
-  gigaflos_no_embeds .............................. 0
-  global_batch_size ............................... 2048
-  glu_activation .................................. None
-  hidden_dropout .................................. 0.1
-  hidden_size ..................................... 11600
-  hysteresis ...................................... 2
-  ict_head_size ................................... None
-  ict_load ........................................ None
-  img_dim ......................................... 224
-  indexer_batch_size .............................. 128
-  indexer_log_interval ............................ 1000
-  init_method_std ................................. 0.02
-  init_method_xavier_uniform ...................... False
-  initial_loss_scale .............................. 4294967296
-  kv_channels ..................................... 145
-  layernorm_epsilon ............................... 1e-05
-  lazy_mpu_init ................................... None
-  load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-  local_rank ...................................... 0
-  log_batch_size_to_tensorboard ................... True
-  log_interval .................................... 1
-  log_learning_rate_to_tensorboard ................ True
-  log_loss_scale_to_tensorboard ................... True
-  log_num_zeros_in_grad ........................... False
-  log_params_norm ................................. False
-  log_timers_to_tensorboard ....................... True
-  log_validation_ppl_to_tensorboard ............... True
-  loss_on_targets_only ............................ False
-  loss_scale ...................................... 12.0
-  loss_scale_window ............................... 1000
-  lr .............................................. 6e-05
-  lr_decay_iters .................................. None
-  lr_decay_samples ................................ None
-  lr_decay_style .................................. cosine
-  lr_decay_tokens ................................. 260000000000
-  lr_warmup_fraction .............................. None
-  lr_warmup_iters ................................. 0
-  lr_warmup_samples ............................... 216320
-  make_vocab_size_divisible_by .................... 128
-  mask_prob ....................................... 0.15
-  masked_softmax_fusion ........................... True
-  max_position_embeddings ......................... 2048
-  memory_centric_tiled_linear ..................... False
-  merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
-  micro_batch_size ................................ 1
-  min_loss_scale .................................. 1.0
-  min_lr .......................................... 6e-06
-  mmap_warmup ..................................... False
-  no_load_optim ................................... None
-  no_load_rng ..................................... None
-  no_save_optim ................................... None
-  no_save_rng ..................................... None
-  num_attention_heads ............................. 80
-  num_channels .................................... 3
-  num_classes ..................................... 1000
-  num_layers ...................................... 64
-  num_layers_per_virtual_pipeline_stage ........... None
-  num_workers ..................................... 2
-  onnx_safe ....................................... None
-  openai_gelu ..................................... False
-  optimizer ....................................... adam
-  override_lr_scheduler ........................... False
-  params_dtype .................................... torch.float16
-  partition_activations ........................... False
-  patch_dim ....................................... 16
-  pipeline_model_parallel_size .................... 32
-  position_embedding_type ......................... PositionEmbeddingType.absolute
-  profile_backward ................................ False
-  query_in_block_prob ............................. 0.1
-  rampup_batch_size ............................... None
-  rank ............................................ 0
-  remote_device ................................... none
-  reset_attention_mask ............................ False
-  reset_position_ids .............................. False
-  retriever_report_topk_accuracies ................ []
-  retriever_score_scaling ......................... False
-  retriever_seq_length ............................ 256
-  sample_rate ..................................... 1.0
-  save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-  save_interval ................................... 300
-  scatter_gather_tensors_in_pipeline .............. True
-  scattered_embeddings ............................ False
-  seed ............................................ 43
-  seq_length ...................................... 2048
-  sgd_momentum .................................... 0.9
-  short_seq_prob .................................. 0.1
-  split ........................................... 949,50,1
-  split_transformers .............................. False
-  synchronize_each_layer .......................... False
-  tensor_model_parallel_size ...................... 4
-  tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
-  tensorboard_log_interval ........................ 1
-  tensorboard_queue_size .......................... 5
-  tile_factor ..................................... 1
-  titles_data_path ................................ None
-  tokenizer_name_or_path .......................... None
-  tokenizer_type .................................. GPT2BPETokenizer
-  train_iters ..................................... None
-  train_samples ................................... 600000000
-  train_tokens .................................... 300000000000
-  use_checkpoint_lr_scheduler ..................... False
-  use_contiguous_buffers_in_ddp ................... False
-  use_cpu_initialization .......................... None
-  use_one_sent_docs ............................... False
-  use_pin_memory .................................. False
-  virtual_pipeline_model_parallel_size ............ None
-  vocab_extra_ids ................................. 0
-  vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
-  weight_decay .................................... 0.1
-  world_size ...................................... 128
-  zero_allgather_bucket_size ...................... 0.0
-  zero_contigious_gradients ....................... False
-  zero_reduce_bucket_size ......................... 0.0
-  zero_reduce_scatter ............................. False
-  zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
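Aside: the parallel topology and the micro-batch count reported above are internally consistent. A quick check in plain Python (constants copied from the arguments dump; illustrative only, not part of the run):

    # 3D parallelism: each GPU holds one (data, tensor, pipeline) coordinate,
    # so world size must equal the product of the three parallel degrees.
    dp, tp, pp = 1, 4, 32
    assert dp * tp * pp == 128  # "using world size: 128"

    # Micro-batches per step: each data-parallel replica works through
    # global_batch_size / dp samples, micro_batch_size samples at a time.
    global_batch_size, micro_batch_size = 2048, 1
    assert global_batch_size // (dp * micro_batch_size) == 2048  # "constant 2048"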
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
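The padded-vocab line above follows from two arguments, make_vocab_size_divisible_by (128) and tensor_model_parallel_size (4): Megatron grows the vocabulary to the next multiple of their product so the embedding table shards evenly across tensor-parallel ranks. A worked check in plain Python (values from this log):

    import math

    vocab_size = 50257   # GPT2BPETokenizer vocabulary
    divisor = 128 * 4    # make_vocab_size_divisible_by * tensor_model_parallel_size
    padded = math.ceil(vocab_size / divisor) * divisor

    assert padded == 50688             # "new size: 50688"
    assert padded - vocab_size == 431  # "431 dummy tokens"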
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-10 10:44:52,568] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/data'
->>> done with dataset index builder. Compilation time: 0.301 seconds
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
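About the UserWarning above: torch.utils.cpp_extension compares the host C++ compiler against the g++ that PyTorch was built with, and every rank re-emits the warning for every fused kernel it loads, which is why the raw log repeats it so many times; "ninja: no work to do." simply means the kernels were already built by an earlier run and are being loaded from cache. One plausible way to silence the mismatch (a sketch, assuming g++ is on PATH on the compute nodes and that this PyTorch release reads the compiler from the CXX environment variable):

    import os
    # Point torch.utils.cpp_extension at g++ before fused kernels are (re)built;
    # with CXX unset it falls back to plain `c++`, which triggers the warning.
    os.environ.setdefault("CXX", "g++")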
->>> done with compiling and loading fused kernels. Compilation time: 17.799 seconds -time to initialize megatron (seconds): 22.805 -[after megatron is initialized] datetime: 2021-10-10 10:45:10 -building GPT model ...
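The WRONG_COMPILER_WARNING block above is emitted by torch.utils.cpp_extension once per process while Megatron JIT-builds its fused kernels, so the original log repeats the same diagnostic for every rank. A minimal sketch of that build path, with a hypothetical source layout — the warning itself goes away if the CXX environment variable points at the g++ that PyTorch was built with:

    import os
    from torch.utils import cpp_extension

    # cpp_extension resolves the host compiler from $CXX, falling back to "c++";
    # on this cluster "c++" is not g++, which is what triggers the warning above.
    # Exporting CXX=g++ before the first build avoids it.
    os.environ.setdefault("CXX", "g++")

    # Module name taken from the log; source file names are hypothetical.
    ext = cpp_extension.load(
        name="scaled_masked_softmax_cuda",
        sources=["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"],
        extra_cuda_cflags=["-O3"],
        verbose=True,  # prints the "Emitting ninja build file ..." lines seen here
    )

Because ninja caches the previous build, each rank reports "ninja: no work to do." and merely reloads the cached modules, which is why the whole compile-and-load step finishes in ~18 seconds here.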
-[2021-10-10 10:45:10,856] [INFO] [utils.py:806:see_memory_usage] Before Building Model -[2021-10-10 10:45:10,857] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB -[2021-10-10 10:45:10,857] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.02 GB, percent = 20.3% -SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None -Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, 
ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127} -[2021-10-10 10:45:12,527] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer -stage=0 layers=5 - 0: _to_float16 - 1: EmbeddingPipe - 2: - 3: ParallelTransformerLayerPipe - 4: ParallelTransformerLayerPipe -stage=1 layers=2 - 5: ParallelTransformerLayerPipe - 6: ParallelTransformerLayerPipe -stage=2 layers=2 - 7: ParallelTransformerLayerPipe - 8: ParallelTransformerLayerPipe -stage=3 layers=2 - 9: ParallelTransformerLayerPipe - 10: ParallelTransformerLayerPipe -stage=4 layers=2 - 11: ParallelTransformerLayerPipe - 12: ParallelTransformerLayerPipe -stage=5 layers=2 - 13: ParallelTransformerLayerPipe - 14: ParallelTransformerLayerPipe -stage=6 layers=2 - 15: ParallelTransformerLayerPipe - 16: ParallelTransformerLayerPipe -stage=7 layers=2 - 17: ParallelTransformerLayerPipe - 18: ParallelTransformerLayerPipe -stage=8 layers=2 - 19: ParallelTransformerLayerPipe - 20: ParallelTransformerLayerPipe -stage=9 layers=2 - 21: ParallelTransformerLayerPipe - 22: ParallelTransformerLayerPipe -stage=10 layers=2 - 23: ParallelTransformerLayerPipe - 24: 
ParallelTransformerLayerPipe -stage=11 layers=2 - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=12 layers=2 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe -stage=13 layers=2 - 29: ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=14 layers=2 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe -stage=15 layers=2 - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe -stage=16 layers=2 - 35: ParallelTransformerLayerPipe - 36: ParallelTransformerLayerPipe -stage=17 layers=2 - 37: ParallelTransformerLayerPipe - 38: ParallelTransformerLayerPipe -stage=18 layers=2 - 39: ParallelTransformerLayerPipe - 40: ParallelTransformerLayerPipe -stage=19 layers=2 - 41: ParallelTransformerLayerPipe - 42: ParallelTransformerLayerPipe -stage=20 layers=2 - 43: ParallelTransformerLayerPipe - 44: ParallelTransformerLayerPipe -stage=21 layers=2 - 45: ParallelTransformerLayerPipe - 46: ParallelTransformerLayerPipe -stage=22 layers=2 - 47: ParallelTransformerLayerPipe - 48: ParallelTransformerLayerPipe -stage=23 layers=2 - 49: ParallelTransformerLayerPipe - 50: ParallelTransformerLayerPipe -stage=24 layers=2 - 51: ParallelTransformerLayerPipe - 52: ParallelTransformerLayerPipe -stage=25 layers=2 - 53: ParallelTransformerLayerPipe - 54: ParallelTransformerLayerPipe -stage=26 layers=2 - 55: ParallelTransformerLayerPipe - 56: ParallelTransformerLayerPipe -stage=27 layers=2 - 57: ParallelTransformerLayerPipe - 58: ParallelTransformerLayerPipe -stage=28 layers=2 - 59: ParallelTransformerLayerPipe - 60: ParallelTransformerLayerPipe -stage=29 layers=2 - 61: ParallelTransformerLayerPipe - 62: ParallelTransformerLayerPipe -stage=30 layers=2 - 63: ParallelTransformerLayerPipe - 64: ParallelTransformerLayerPipe -stage=31 layers=6 - 65: ParallelTransformerLayerPipe - 66: ParallelTransformerLayerPipe - 67: - 68: MixedFusedLayerNorm - 69: EmbeddingPipe - 70: float16_to_fp32 - loss: CrossEntropy - > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800 - > number of 
parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800 - > number of parameters on (tensor, pipeline) 
model parallel rank (2, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800 - > 
number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800 -[2021-10-10 10:45:13,240] [INFO] [utils.py:806:see_memory_usage] After Building Model -[2021-10-10 10:45:13,241] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.9 GB CA 1.91 GB Max_CA 2 GB -[2021-10-10 10:45:13,241] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.21 GB, percent = 20.4% - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800 -setting training iterations to 292968 -> learning rate decay style: cosine -DeepSpeed is enabled. 
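A quick consistency check on the counts above: the topology is 32 pipeline stages × 4 tensor-parallel ranks × 1 data-parallel replica = 128 processes, and every middle stage carries two ParallelTransformerLayerPipe layers. Assuming the usual ≈12·h² weight parameters per transformer layer and a hidden size of 11600 (inferred from the counts, not stated in this log), the per-rank figure of 807539800 falls out exactly:

    # Back-of-the-envelope check; hidden=11600 is inferred, not logged.
    PP, TP, DP = 32, 4, 1             # pipe / model / data degrees from the topology map
    assert PP * TP * DP == 128        # ranks 0..127

    hidden = 11600
    layer_weights = 12 * hidden**2            # QKV, attention-output and MLP matrices
    stage_weights = 2 * layer_weights // TP   # two layers per stage, sharded over TP
    extras = 179_800                          # remainder (biases + layer norms), read off the log
    print(stage_weights + extras)             # 807539800 -- matches the middle stages above

The first and last stages additionally hold the (tensor-parallel-sharded) EmbeddingPipe, which is why ranks on pipe=0 and pipe=31 report ≈978M parameters instead.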
-[2021-10-10 10:45:13,242] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+cd7967d, git-hash=cd7967d, git-branch=master -[2021-10-10 10:45:13,279] [INFO] [engine.py:204:__init__] DeepSpeed Flops Profiler Enabled: False -[2021-10-10 10:45:13,279] [INFO] [engine.py:848:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer -[2021-10-10 10:45:13,279] [INFO] [engine.py:854:_configure_optimizer] Using client Optimizer as basic optimizer -[2021-10-10 10:45:13,279] [INFO] [engine.py:870:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -[2021-10-10 10:45:13,280] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= -[2021-10-10 10:45:13,280] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer -[2021-10-10 10:45:13,280] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000 -[2021-10-10 10:45:13,280] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000 -[2021-10-10 10:45:13,280] [INFO] [stage2.py:113:__init__] CPU Offload: False -[2021-10-10 10:45:13,280] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False -Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 97 partition count [1, 1] and 
sizes[(807360000, False), (179800, False)] -Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 64 partition count [1, 1] and sizes[(807360000, False), 
(179800, False)] -Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 
46 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)] -Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)] -Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)] -Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)] -Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)] -Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)] -Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)] -Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)] -Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)] -[2021-10-10 10:45:15,114] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states -[2021-10-10 10:45:15,114] [INFO] [utils.py:807:see_memory_usage] MA 5.48 GB Max_MA 7.3 GB CA 9.25 GB Max_CA 9 GB -[2021-10-10 10:45:15,115] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.23 GB, percent = 20.4% -[2021-10-10 10:45:15,167] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states -[2021-10-10 10:45:15,168] [INFO] [utils.py:807:see_memory_usage] MA 12.77 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB -[2021-10-10 10:45:15,168] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.23 GB, percent = 20.4% -[2021-10-10 10:45:15,168] [INFO] [stage2.py:474:__init__] optimizer state initialized -[2021-10-10 10:45:15,203] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer -[2021-10-10 10:45:15,204] [INFO] [utils.py:807:see_memory_usage] MA 12.77 GB Max_MA 12.77 GB CA 20.19 GB Max_CA 20 GB -[2021-10-10 10:45:15,204] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.23 GB, percent = 20.4% -[2021-10-10 10:45:15,204] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam -[2021-10-10 10:45:15,204] [INFO] [engine.py:596:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2021-10-10 10:45:15,204] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = -[2021-10-10 10:45:15,204] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] -[2021-10-10 10:45:15,204] [INFO] [config.py:940:print] DeepSpeedEngine configuration: -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] amp_enabled .................. False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] amp_params ................... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] curriculum_enabled ........... 
True -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}} -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] dataloader_drop_last ......... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] disable_allgather ............ False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] dump_state ................... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_enabled ........... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1 -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0 -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100 -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06 -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01 -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] eigenvalue_verbose ........... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] elasticity_enabled ........... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] fp16_enabled ................. True -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False -[2021-10-10 10:45:15,205] [INFO] [config.py:944:print] global_rank .................. 0 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] gradient_clipping ............ 1.0 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] loss_scale ................... 0 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] memory_breakdown ............. False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] optimizer_name ............... None -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] optimizer_params ............. None -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] pld_enabled .................. False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] pld_params ................... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] prescale_gradients ........... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_change_rate ......... 
0.001 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_groups .............. 1 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_offset .............. 1000 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_period .............. 1000 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_rounding ............ 0 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_start_bits .......... 16 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_target_bits ......... 8 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_training_enabled .... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_type ................ 0 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] quantize_verbose ............. False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] scheduler_name ............... None -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] scheduler_params ............. None -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] sparse_attention ............. None -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] steps_per_print .............. 2000 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] tensorboard_enabled .......... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] tensorboard_output_path ...... -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] train_batch_size ............. 2048 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] use_quantizer_kernel ......... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] wall_clock_breakdown ......... False -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] world_size ................... 1 -[2021-10-10 10:45:15,206] [INFO] [config.py:944:print] zero_allow_untested_optimizer False -[2021-10-10 10:45:15,207] [INFO] [config.py:944:print] zero_config .................. { - "stage": 1, - "contiguous_gradients": true, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+09, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "round_robin_gradients": false, - "legacy_stage1": false -} -[2021-10-10 10:45:15,207] [INFO] [config.py:944:print] zero_enabled ................. True -[2021-10-10 10:45:15,207] [INFO] [config.py:944:print] zero_optimization_stage ...... 
-[2021-10-10 10:45:15,207] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
-[2021-10-10 10:45:15,207] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
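
The engine banner above (micro_batches=2048, micro_batch_size=1) follows from the config: DeepSpeed enforces train_batch_size = micro_batch_size x gradient_accumulation_steps x data-parallel degree. A quick consistency check; the tensor-parallel width of 4 is an assumption inferred from the RANK/STAGE lines that follow (4 ranks per stage):

    micro_batch_size = 1
    gradient_accumulation_steps = 2048
    world_size = 128        # RANK=0..127 in the lines below
    pipeline_stages = 32    # STAGE=0..31 in the lines below
    tensor_parallel = 4     # assumed: 4 ranks per stage
    dp_degree = world_size // (pipeline_stages * tensor_parallel)  # = 1

    # 1 * 2048 * 1 == 2048, matching train_batch_size and micro_batches above
    assert micro_batch_size * gradient_accumulation_steps * dp_degree == 2048
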
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 10:45:15,595] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
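
The per-rank banners above map the 128 ranks onto 32 pipeline stages, four ranks per stage (presumably one tensor-parallel group each); the first and last stages carry extra layers (embedding and head), hence their larger STAGE_PARAMS. An illustrative reconstruction of the mapping, not Megatron-DeepSpeed code:

    TP, PP = 4, 32   # assumed tensor-parallel width; 32 stages as printed above

    def stage_of(rank):
        return rank // TP                       # e.g. ranks 16-19 -> STAGE=4

    def layer_range(stage):
        if stage == 0:                          # stage 0 holds layers [0, 5)
            return (0, 5)
        if stage == PP - 1:                     # last stage holds layers [65, 71)
            return (65, 71)
        return (2 * stage + 3, 2 * stage + 5)   # two layers per middle stage

    assert stage_of(19) == 4 and layer_range(4) == (11, 13)
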
-[2021-10-10 10:45:15,682] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - will not load any checkpoints and will start from random
-time (ms) | load-checkpoint: 3.77
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
-estimated model parameters: 125.22432
-estimated model parameters without embeddings: 103.3650944
-estimated model parameters without embeddings: 103.368064
first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") - -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated 
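For context, the "without embeddings" figure above matches the usual transformer sizing arithmetic for this run's shape (64 layers, hidden size 11600, sequence length 2048, GPT-2 vocabulary, all taken from the launch command echoed further down when the job aborts); the with-embeddings figures differ per rank because, as the warning notes, the first and last pipeline stages hold extra copies of the embeddings. A minimal sketch of that arithmetic, assuming the standard 12*h^2 + 13*h per-layer cost and an unpadded GPT-2 vocabulary, not the actual megatron/utils.py code:

    # Back-of-the-envelope parameter count for the 104B run.
    # Assumptions (hedged): per-layer cost ~ 12*h^2 + 13*h (attention + MLP
    # weights, biases, layernorms), tied input/output embeddings, and the
    # unpadded GPT-2 vocabulary; Megatron's exact padding is not in the log.
    l, h, s = 64, 11600, 2048          # layers, hidden size, sequence length
    v = 50257                          # GPT-2 vocab (before padding)

    per_layer = 12 * h * h + 13 * h
    without_embeddings = l * per_layer + 2 * h   # + final layernorm
    embeddings = v * h + s * h                   # token + position embeddings

    # ~103.35B, close to the 103.3650944 reported above
    print(f"without embeddings: {without_embeddings / 1e9:.2f}B")
    print(f"with embeddings:    {(without_embeddings + embeddings) / 1e9:.2f}B")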
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-10 10:45:15
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 3000320
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.140790 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > WARNING: could not find index map files, building the indices on rank 0 ...
- > last epoch number of samples (73851107) is smaller than 80% of number of samples per epoch (131537223), setting separate_last_epoch to True
- > elapsed time to build and save doc-idx mapping (seconds): 126.075045
-    using:
-     number of documents:     288714672
-     number of epochs:        5
-     sequence length:         2048
-     total number of samples: 657686116
- > elapsed time to build and save sample-idx mapping (seconds): 37.277918
- > building shuffle index with split [0, 526148893) and [526148893, 657686116) ...
- > elapsed time to build and save shuffle-idx mapping (seconds): 44.179906
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.107 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > WARNING: could not find index map files, building the indices on rank 0 ...
- > only one epoch required, setting separate_last_epoch to False
- > elapsed time to build and save doc-idx mapping (seconds): 1.007942
-    using:
-     number of documents:     15211521
-     number of epochs:        1
-     sequence length:         2048
-     total number of samples: 6927160
- > elapsed time to build and save sample-idx mapping (seconds): 0.383493
- > building shuffle index with split [0, 6927160) and [6927160, 6927160) ...
- > elapsed time to build and save shuffle-idx mapping (seconds): 0.321055
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.043 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.034 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-10 10:48:50
-done with setup ...
-training ...
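The epoch count and the separate_last_epoch choice reported above follow from the requested 600M training samples against roughly 131.5M samples per pass over the train split. A small sketch of that bookkeeping, assuming the 80% threshold quoted in the warning; variable names are illustrative, not Megatron's:

    import math

    # Figures taken from the log lines above.
    target_samples    = 600_000_000
    samples_per_epoch = 131_537_223   # one pass over the train split

    num_epochs = math.ceil(target_samples / samples_per_epoch)          # 5
    last_epoch_samples = target_samples - (num_epochs - 1) * samples_per_epoch

    # A separate shuffle range is built for the last epoch when it is
    # less than 80% full, so partially-seen documents are not over-sampled.
    separate_last_epoch = last_epoch_samples < 0.8 * samples_per_epoch

    print(num_epochs, last_epoch_samples, separate_last_epoch)
    # -> 5 73851108 True  (the log reports 73851107; off by one from indexing)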
-time (ms) | model-and-optimizer-setup: 4882.59 | train/valid/test-data-iterators-setup: 213714.07
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-[before the start of training step] datetime: 2021-10-10 10:48:50
-[2021-10-10 10:48:50,114] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-10 10:48:50,114] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-10 10:48:50,115] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-10 10:48:50,115] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-10 10:48:50,115] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 246, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/training.py", line 165, in pretrain
-    iteration = train(forward_step_func,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/training.py", line 732, in train
-    train_step(forward_step_func,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/training.py", line 405, in train_step
-    loss = model[0].train_batch(data_iter=data_iterator)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/engine.py", line 329, in train_batch
-    self._exec_schedule(sched)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/engine.py", line 1313, in _exec_schedule
-    self._exec_instr(**cmd.kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/engine.py", line 631, in _exec_forward_pass
-    outputs = super().forward(inputs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/engine.py", line 1321, in forward
-    loss = self.module(*inputs, **kwargs)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/module.py", line 352, in forward
-    x = self.activation_checkpoint_func(
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 743, in checkpoint
-    CheckpointFunction.apply(function, all_outputs, *args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 582, in forward
-    outputs = run_function(*inputs_cuda)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/module.py", line 330, in exec_func
-    inputs = layer(inputs)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/transformer.py", line 588, in forward
-    return super().forward(hidden_states, attention_mask, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/transformer.py", line 479, in forward
-    self.self_attention(layernorm_output,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/transformer.py", line 333, in forward
-    attention_probs = self.scale_mask_softmax(attention_scores,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/fused_softmax.py", line 146, in forward
-    probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/fused_softmax.py", line 34, in forward
-    softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
-RuntimeError: attn_batches % batches_per_block == 0 INTERNAL ASSERT FAILED at "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h":363, please report a bug to PyTorch.
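The assertion comes from the fused upper-triangular masked-softmax CUDA kernel, which tiles the [batch x heads, seq, seq] attention-score tensor into fixed-size blocks and therefore requires the leading dimension to divide evenly. With this run's flags, each tensor-parallel rank sees micro-batch 1 x (80 / 4) = 20 heads, which can fail that divisibility check. A sketch of the check, with batches_per_block an assumed illustrative value (the real one is derived from the sequence length inside scaled_upper_triang_masked_softmax.h):

    # Why the fused kernel asserts: it processes the [b*np, sq, sk] scores
    # tensor in fixed-size blocks, so the leading dimension must divide evenly.
    # Numbers below come from the launch command; batches_per_block is an
    # ASSUMED illustrative value, not read out of the kernel.
    micro_batch_size = 1
    num_heads        = 80
    tp_size          = 4

    heads_per_rank = num_heads // tp_size          # 20
    attn_batches   = micro_batch_size * heads_per_rank

    batches_per_block = 8                          # assumption for seq 2048
    if attn_batches % batches_per_block != 0:
        print(f"{attn_batches} % {batches_per_block} != 0 -> kernel assert, "
              "as in the RuntimeError above")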
-Killing subprocess 1304934
-Killing subprocess 1304935
-Killing subprocess 1304936
-Killing subprocess 1304937
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code
-    exec(code, run_globals)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in <module>
-    main()
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main
-    sigkill_handler(signal.SIGTERM, None)  # not coming back
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
-    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
-subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1504412.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1.
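The aborted command pins down the 3D-parallel layout. A quick sanity check of how the flags compose, assuming 4 GPUs per node across the 32 srun tasks listed below (the per-node GPU count is not stated in the log itself):

    # 3D-parallelism bookkeeping from the --tensor-model-parallel-size /
    # --pipeline-model-parallel-size / batch flags in the command above.
    # ASSUMPTION: 4 GPUs per node across the 32 srun tasks seen in this log.
    nodes, gpus_per_node = 32, 4
    tp, pp = 4, 32
    micro_batch, global_batch = 1, 2048

    world_size = nodes * gpus_per_node                      # 128
    dp = world_size // (tp * pp)                            # 1: no data parallelism
    grad_accum_steps = global_batch // (micro_batch * dp)   # 2048

    print(world_size, dp, grad_accum_steps)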
-srun: error: r7i4n4: task 0: Exited with exit code 1
-srun: Terminating job step 1504412.0
-Killing subprocess 580951
-Killing subprocess 580952
-Killing subprocess 580953
-Killing subprocess 580955
-Main process received SIGTERM, exiting
-srun: error: r7i6n0: task 14: Exited with exit code 1
-srun: error: r7i5n7: task 12: Exited with exit code 1
-srun: error: r7i5n8: task 13: Exited with exit code 1
-srun: error: r9i6n5: task 19: Exited with exit code 1
-srun: error: r9i6n2: task 16: Exited with exit code 1
-srun: error: r9i7n2: task 25: Exited with exit code 1
-srun: error: r9i6n1: task 15: Exited with exit code 1
-srun: error: r9i6n6: task 20: Exited with exit code 1
-srun: error: r9i6n7: task 21: Exited with exit code 1
-srun: error: r7i5n0: task 5: Exited with exit code 1
-srun: error: r9i6n3: task 17: Exited with exit code 1
-srun: error: r9i7n4: task 27: Exited with exit code 1
-srun: error: r9i7n0: task 23: Exited with exit code 1
-srun: error: r7i4n6: task 2: Exited with exit code 1
-srun: error: r9i6n4: task 18: Exited with exit code 1
-srun: error: r7i4n8: task 4: Exited with exit code 1
-srun: error: r7i5n1: task 6: Exited with exit code 1
-srun: error: r9i7n5: task 28: Exited with exit code 1
-srun: error: r9i6n8: task 22: Exited with exit code 1
-srun: error: r7i5n4: task 9: Exited with exit code 1
-srun: error: r7i5n6: task 11: Exited with exit code 1
-srun: error: r9i7n1: task 24: Exited with exit code 1
-srun: error: r7i4n7: task 3: Exited with exit code 1
-srun: error: r7i5n5: task 10: Exited with exit code 1
-srun: error: r7i5n3: task 8: Exited with exit code 1
-srun: error: r9i7n3: task 26: Exited with exit code 1
-srun: error: r9i7n8: task 31: Exited with exit code 1
-srun: error: r7i5n2: task 7: Exited with exit code 1
-srun: error: r9i7n7: task 30: Exited with exit code 1
-srun: error: r9i7n6: task 29: Exited with exit code 1
-srun: error: r7i4n5: task 1: Exited with exit code 1
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
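The banner above is printed once per launched process (it repeats many times in the raw log, collapsed here): the launcher defaults OMP_NUM_THREADS to 1 so that dozens of ranks per node do not oversubscribe the cores. A minimal sketch of tuning it per rank before any numerical library is imported; using SLURM_CPUS_PER_TASK as the thread budget is an assumption, not something this log shows:

    import os

    # Use the CPUs SLURM granted this task as the OpenMP thread budget
    # (SLURM_CPUS_PER_TASK is an assumed source; the default stays 1 if
    # the variable is absent or OMP_NUM_THREADS was already set).
    os.environ.setdefault(
        "OMP_NUM_THREADS", os.environ.get("SLURM_CPUS_PER_TASK", "1")
    )

    # torch/numpy should be imported only after this point, since most
    # OpenMP runtimes read the variable once at initialization.
    print("OMP_NUM_THREADS =", os.environ["OMP_NUM_THREADS"])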
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
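Every rank prints the report above (the raw log interleaves dozens of identical copies, collapsed here): ops marked [NO] but compatible are built just-in-time with ninja on first use, while cpu_adam was pre-installed. A sketch of querying the same builders from Python, assuming DeepSpeed's deepspeed.ops.op_builder interface; the builder class names are assumptions matched to the op list above:

    # is_compatible() corresponds to the "compatible" column of the
    # report; load() would JIT-compile the extension via ninja.
    from deepspeed.ops.op_builder import (
        CPUAdamBuilder,
        FusedAdamBuilder,
        FusedLambBuilder,
    )

    for builder in (CPUAdamBuilder(), FusedAdamBuilder(), FusedLambBuilder()):
        print(builder.NAME, "compatible:", builder.is_compatible())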
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
[OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -ninja .................. [OKAY] -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
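For reference, this op compatibility table is DeepSpeed's standard extension report; it can be regenerated on any node with the bundled ds_report utility, and the DS_BUILD_* install-time environment variables (a documented DeepSpeed convention, assumed to behave this way in the 0.5.x wheel used here) allow precompiling ops at install time instead of JIT-compiling them through ninja at first use:

    # Reprint the op compatibility report for the current environment
    ds_report
    # Prebuild selected ops at install time instead of JIT-compiling at runtime
    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 pip install deepspeed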
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
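Megatron records the code revision by probing git through /bin/sh at startup; git is not on PATH inside this job environment, so the hash and branch fall back to unknown. A rough shell equivalent of that probe (a sketch, not the exact Megatron code):

    # Mirrors the failed lookup behind the two lines above
    type git >/dev/null 2>&1 && git rev-parse --short HEAD || echo 'git not found'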
- [WARNING] async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING] async_io: please install the libaio-devel package with yum
- [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
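async_io is the one op reported [NO] under compatible: the libaio development files are absent on the nodes, so the op can be neither JIT-built nor preinstalled until libaio-devel (or a from-source libaio plus CFLAGS/LDFLAGS pointing at it) is provided. A short sketch reproducing just that probe, assuming AsyncIOBuilder is the builder behind the async_io row as in 0.5.x:

# Hedged sketch: reproduce the async_io compatibility probe from the report.
# AsyncIOBuilder backing the async_io row is an assumption based on 0.5.x.
from deepspeed.ops.op_builder import AsyncIOBuilder

if AsyncIOBuilder().is_compatible():
    print("async_io ............... [OKAY]")
else:
    # Matches the warnings above: install the libaio headers
    # (yum install libaio-devel), or for a from-source libaio point the
    # build at it via the CFLAGS and LDFLAGS environment variables.
    print("async_io ............... [NO] (libaio dev files not found)")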
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
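The same fields can be printed outside the launcher with DeepSpeed's ds_report utility, or with the sketch below; treating deepspeed.git_version_info as the source of the version/hash/branch triple is an assumption based on 0.5.x behaviour:

# Hedged sketch: print the fields of "DeepSpeed general environment info".
import torch
import deepspeed
from deepspeed import git_version_info  # assumed module, as in 0.5.x

print("torch install path .......", torch.__path__)
print("torch version ............", torch.__version__)
print("torch cuda version .......", torch.version.cuda)
print("deepspeed install path ...", deepspeed.__path__)
print("deepspeed info ...........", git_version_info.version,
      git_version_info.git_hash, git_version_info.git_branch)

The "wheel compiled w." line records the torch/CUDA pair the wheel was built against (torch 1.8, cuda 11.1), which matches the runtime torch 1.8.1 / CUDA 11.1 reported above; nvcc 11.2 only matters for ops compiled on this machine.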
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
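git is not on the shell's PATH inside the job (the `type: git: not found` line is sh failing to locate it), so the launcher's probe degrades to unknown for both fields. A hedged sketch of such a probe (get_git_info is an illustrative name, not Megatron's actual helper):

# Hedged sketch of a git-info probe that falls back to "unknown" when git
# is absent, as in the log line above. get_git_info is an illustrative name.
import subprocess

def get_git_info(repo_dir="."):
    try:
        git_hash = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=repo_dir, text=True).strip()
        git_branch = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
            cwd=repo_dir, text=True).strip()
        return git_hash, git_branch
    except (FileNotFoundError, subprocess.CalledProcessError):
        return "unknown", "unknown"

git_hash, git_branch = get_git_info()
print(f"**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****")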
- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found - [WARNING]  async_io: please install the libaio-devel package with yum -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... async_io[NO] ...................... [NO] -[NO] ....... [NO] -transformer_inference .. [NO] .......transformer_inference [OKAY].. - [NO] ....... utils[OKAY] -.................. [YES] ...... [OKAY] -utils .................. [YES]quantizer .................... [OKAY][NO] - ....... [OKAY] -quantizer ..............-------------------------------------------------- -[NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io: please install the libaio-devel package with yum -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -DeepSpeed general environment info: -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... 
[OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch version .................... 1.8.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -async_io ............... async_io[NO] ...................... [NO][NO] -....... [NO] -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -utils utils.................. ..................[YES] [YES]...... ......[OKAY] -[OKAY] -quantizer quantizer.............. ..............[NO] [NO]....... .......[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -async_io ............... [NO] ....... 
[NO] -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -async_io async_io............... [NO]............... .......[NO] [NO]....... - [NO] -transformer_inferencetransformer_inference .... [NO][NO] ....... .......[OKAY] -[OKAY] -utilsutils .................................... [YES][YES] ............ [OKAY][OKAY] - -quantizerquantizer ............................ [NO][NO] .............. [OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference ..utils [NO].................. .......[YES] [OKAY]...... - [OKAY] -quantizer utils.............. ..................[NO] [YES]....... ......[OKAY] -[OKAY] ---------------------------------------------------quantizer - .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -/bin/sh: line 0: type: git: not found -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... 
[NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: - [WARNING]  async_io: please install the libaio-devel package with yum -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io: please install the libaio-devel package with yum -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -utils .................. [YES] ...... [OKAY] -async_io ............... [NO] ....... 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
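The block above is DeepSpeed's extension-op report; the "compatible" column can also be checked per op from Python. A minimal sketch, assuming the op builders exposed by deepspeed 0.5.x (the exact builder names and the is_compatible() call are an assumption here, not taken from this log):

    # compat_check.py -- sketch; OpBuilder.is_compatible() backs the
    # "compatible" column of the report above (assumed API, deepspeed 0.5.x).
    from deepspeed.ops.op_builder import CPUAdamBuilder, UtilsBuilder

    print("cpu_adam compatible:", CPUAdamBuilder().is_compatible())
    print("utils    compatible:", UtilsBuilder().is_compatible())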
1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed'] -deepspeed info ................... 
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1504567.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 1000
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 1190
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.02
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 6e-05
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 216320
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... False
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
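Editor's note: the parallel layout and batch arithmetic in the argument dump are internally consistent; a quick check with values copied from the dump above (the formula is how Megatron-DeepSpeed derives its constant micro-batch count):

# Editor's sketch: consistency checks for the arguments above.
world_size = 128
tp, pp, dp = 4, 32, 1          # tensor-, pipeline-, data-parallel sizes
assert tp * pp * dp == world_size

global_batch_size, micro_batch_size = 2048, 1
# micro-batches per step = global batch / (micro batch * data-parallel size)
num_micro_batches = global_batch_size // (micro_batch_size * dp)
assert num_micro_batches == 2048   # "setting number of micro-batches to constant 2048"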
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
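Editor's note: the 431 dummy tokens follow from padding the vocabulary to a multiple of make_vocab_size_divisible_by x tensor_model_parallel_size, so every tensor-parallel shard of the embedding gets an equal slice. A sketch of the computation:

# Editor's sketch: derivation of the padded vocab size logged above.
vocab_size = 50257
make_vocab_size_divisible_by = 128
tensor_model_parallel_size = 4
multiple = make_vocab_size_divisible_by * tensor_model_parallel_size  # 512
padded = ((vocab_size + multiple - 1) // multiple) * multiple
assert padded == 50688
assert padded - vocab_size == 431  # the "dummy tokens"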
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-10 11:10:49,718] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/data'
->>> done with dataset index builder. Compilation time: 0.298 seconds
-WARNING: constraints for invoking optimized fused softmax kernel are not met. We default back to unfused kernel invocations.
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
- warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
->>> done with compiling and loading fused kernels. Compilation time: 4.757 seconds
-time to initialize megatron (seconds): 30.775
-[after megatron is initialized] datetime: 2021-10-10 11:10:54
-building GPT model ...
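Editor's note: the "model parallel seed: 2761" logged above is the base seed plus a fixed offset; in upstream Megatron-LM's model-parallel RNG helper the offset is 2718 plus the tensor-parallel rank (an assumption based on that codebase, not printed by this run), giving each tensor-parallel rank its own dropout stream while data-parallel replicas share the base seed:

# Editor's sketch: how the logged seeds are derived (offset per Megatron-LM's
# model_parallel_cuda_manual_seed; assumed, not shown in this log).
seed = 43
tensor_parallel_rank = 0                       # global rank 0
model_parallel_seed = seed + 2718 + tensor_parallel_rank
data_parallel_seed = seed
assert (model_parallel_seed, data_parallel_seed) == (2761, 43)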
-[2021-10-10 11:10:54,842] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-10 11:10:54,843] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-10 11:10:54,843] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.02 GB, percent = 20.3%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
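Editor's note: the topology map above is mechanical: with data fixed at 0 and model the fastest-varying axis, the global rank is pipe x 4 + model. The ~807.5M parameters per rank reported further below are also roughly what two tensor-parallel transformer-layer shards of this width come to (an approximate check; the exact bias/layernorm bookkeeping differs slightly):

# Editor's sketch: reconstruct the rank grid and sanity-check per-rank parameters.
TP, PP = 4, 32
rank_of = {(pipe, 0, model): pipe * TP + model
           for pipe in range(PP) for model in range(TP)}
assert rank_of[(0, 0, 0)] == 0 and rank_of[(31, 0, 3)] == 127

h = 11600                        # hidden_size; ffn_hidden_size = 4h = 46400
per_layer = 12 * h * h           # approx. weight count of one transformer layer
per_rank = 2 * per_layer // TP   # 2 layers per middle pipeline stage, sharded 4 ways
# per_rank == 807,360,000 -- close to the logged 807539800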
ParallelTransformerLayerPipe -stage=11 layers=2 - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=12 layers=2 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe -stage=13 layers=2 - 29: ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=14 layers=2 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe -stage=15 layers=2 - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe -stage=16 layers=2 - 35: ParallelTransformerLayerPipe - 36: ParallelTransformerLayerPipe -stage=17 layers=2 - 37: ParallelTransformerLayerPipe - 38: ParallelTransformerLayerPipe -stage=18 layers=2 - 39: ParallelTransformerLayerPipe - 40: ParallelTransformerLayerPipe -stage=19 layers=2 - 41: ParallelTransformerLayerPipe - 42: ParallelTransformerLayerPipe -stage=20 layers=2 - 43: ParallelTransformerLayerPipe - 44: ParallelTransformerLayerPipe -stage=21 layers=2 - 45: ParallelTransformerLayerPipe - 46: ParallelTransformerLayerPipe -stage=22 layers=2 - 47: ParallelTransformerLayerPipe - 48: ParallelTransformerLayerPipe -stage=23 layers=2 - 49: ParallelTransformerLayerPipe - 50: ParallelTransformerLayerPipe -stage=24 layers=2 - 51: ParallelTransformerLayerPipe - 52: ParallelTransformerLayerPipe -stage=25 layers=2 - 53: ParallelTransformerLayerPipe - 54: ParallelTransformerLayerPipe -stage=26 layers=2 - 55: ParallelTransformerLayerPipe - 56: ParallelTransformerLayerPipe -stage=27 layers=2 - 57: ParallelTransformerLayerPipe - 58: ParallelTransformerLayerPipe -stage=28 layers=2 - 59: ParallelTransformerLayerPipe - 60: ParallelTransformerLayerPipe -stage=29 layers=2 - 61: ParallelTransformerLayerPipe - 62: ParallelTransformerLayerPipe -stage=30 layers=2 - 63: ParallelTransformerLayerPipe - 64: ParallelTransformerLayerPipe -stage=31 layers=6 - 65: ParallelTransformerLayerPipe - 66: ParallelTransformerLayerPipe - 67: - 68: MixedFusedLayerNorm - 69: EmbeddingPipe - 70: float16_to_fp32 - loss: CrossEntropy - > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800 - > number of 
- > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
-[2021-10-10 11:10:57,262] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-10 11:10:57,263] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.9 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-10 11:10:57,263] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.2 GB, percent = 20.4%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
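The per-rank counts above are internally consistent. The sketch below reproduces them from an assumed geometry of 64 transformer layers, hidden size 11600, sequence length 2048, a vocabulary padded to 50688, tensor parallelism 4 and pipeline parallelism 32; none of these values is printed in this excerpt, so treat them as inferred assumptions, albeit ones the arithmetic pins down exactly:

    # Hedged sketch: reproduce the logged per-rank parameter counts from an
    # assumed geometry (h=11600, seq=2048, padded vocab=50688, TP=4, PP=32).
    h, seq, vocab, TP = 11600, 2048, 50688, 4

    # One transformer layer, per tensor-parallel rank:
    matrices = 12 * h * h // TP                 # QKV 3h^2 + attn out h^2 + MLP 8h^2
    small = 4 * h + 3 * h // TP + h + 4 * h // TP + h   # 2 layernorms + biases

    mid_stage = 2 * (matrices + small)          # two layers per middle stage
    assert mid_stage == 807_539_800             # ranks (*, 1) .. (*, 30)

    # Stage 0 adds the TP-split word embedding plus replicated positions.
    embed = vocab * h // TP + seq * h
    assert mid_stage + embed == 978_291_800             # ranks (*, 0)
    # Stage 31 reuses the tied embedding and adds the final layernorm (2h).
    assert mid_stage + embed + 2 * h == 978_315_000     # ranks (*, 31)

    # Whole-model totals as reported later in the RANK=... summary lines;
    # UNIQUE counts the tied embedding once, TOTAL counts it on both ends.
    total = 120 * mid_stage + 4 * (mid_stage + embed) + 4 * (mid_stage + embed + 2 * h)
    assert total == 104_731_203_200
    assert total - 4 * embed == 104_048_195_200

The iteration count is consistent with a sample budget rather than a step budget: 292,968 iterations at 2,048 samples per step is 599,998,464, i.e. roughly 600M samples.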
-[2021-10-10 11:10:57,264] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+cd7967d, git-hash=cd7967d, git-branch=master
-[2021-10-10 11:10:57,304] [INFO] [engine.py:204:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-10 11:10:57,304] [INFO] [engine.py:848:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-10 11:10:57,304] [INFO] [engine.py:854:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-10 11:10:57,304] [INFO] [engine.py:870:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-10 11:10:57,305] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-10 11:10:57,305] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-10 11:10:57,305] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-10 11:10:57,305] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-10 11:10:57,305] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-10 11:10:57,305] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
-Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 97 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 46 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 64 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-[2021-10-10 11:10:59,122] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-10 11:10:59,123] [INFO] [utils.py:807:see_memory_usage] MA 5.48 GB Max_MA 7.3 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-10 11:10:59,123] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.22 GB, percent = 20.4%
-[2021-10-10 11:10:59,169] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-10 11:10:59,169] [INFO] [utils.py:807:see_memory_usage] MA 12.77 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-10 11:10:59,170] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.22 GB, percent = 20.4%
-[2021-10-10 11:10:59,170] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-10 11:10:59,198] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-10 11:10:59,199] [INFO] [utils.py:807:see_memory_usage] MA 12.77 GB Max_MA 12.77 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-10 11:10:59,199] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.22 GB, percent = 20.4%
-[2021-10-10 11:10:59,199] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-10 11:10:59,199] [INFO] [engine.py:596:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-10 11:10:59,199] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-10 11:10:59,199] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
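The `partition count [1, 1]` lines are worth decoding. ZeRO stage 1 shards optimizer state across the data-parallel group, but with all 128 GPUs consumed by 32 pipeline stages × 4 tensor ranks the data-parallel world has a single member, so every rank holds exactly one partition of each of its two parameter groups and the optimizer effectively shards nothing. The two size tuples line up with Megatron's weight-decay / no-weight-decay parameter-group split (a plausible reading consistent with the arithmetic, not something the log states); reusing the geometry assumed in the earlier sketch:

    h, TP = 11600, 4
    decayed = 2 * 12 * h * h // TP                              # 807,360,000 matrix weights
    no_decay = 2 * (4 * h + 3 * h // TP + h + 4 * h // TP + h)  # 179,800 layernorms + biases
    assert (decayed, no_decay) == (807_360_000, 179_800)

First- and last-stage ranks report 978,112,000 in the first group instead, i.e. the 170,752,000 embedding parameters join it, and the last stage's 203,000 no-decay count adds the final layernorm's 23,200 parameters.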
-[2021-10-10 11:10:59,199] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] activation_checkpointing_config {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-10 11:10:59,200] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-10 11:10:59,201] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-10 11:10:59,202] [INFO] [config.py:944:print] zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-10 11:10:59,202] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-10 11:10:59,202] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
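Two things in this dump are worth a sanity check. First, the batch geometry: train_batch_size (2048) = train_micro_batch_size_per_gpu (1) × gradient_accumulation_steps (2048) × data-parallel replicas (1), matching the micro_batches=2048 line below. Second, the curriculum: sequence length ramps from 64 to 2048 over 36,000 steps in multiples of 8. A rough Python rendering of such a fixed_linear schedule, offered as an illustration under those config values rather than a reproduction of DeepSpeed's exact code path:

    def seqlen_at(step, lo=64, hi=2048, total=36_000, unit=8):
        """Linear ramp from lo to hi over `total` steps, rounded down to `unit`."""
        if step >= total:
            return hi
        s = lo + (hi - lo) * step / total
        return max(lo, int(s) // unit * unit)

    assert seqlen_at(0) == 64
    assert seqlen_at(18_000) == 1056   # halfway up the ramp
    assert seqlen_at(36_000) == 2048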
-[2021-10-10 11:10:59,202] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
-[2021-10-10 11:10:59,202] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,591] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-10 11:10:59,695] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
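This warning is benign on a first run and is emitted once per rank: DeepSpeed resolves which checkpoint to resume from by reading a one-line `latest` file in the checkpoint directory, and no such file exists before the first save. Roughly, as a sketch of the lookup rather than DeepSpeed's verbatim code:

    import os

    def latest_tag(ckpt_dir):
        """Return the tag recorded in <ckpt_dir>/latest, or None on a fresh run."""
        path = os.path.join(ckpt_dir, "latest")
        if not os.path.isfile(path):
            return None              # triggers the warning above; training starts fresh
        with open(path) as f:
            return f.read().strip()  # e.g. "global_step1000"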
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - will not load any checkpoints and will start from random
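Both warnings above come from the same lookup: on resume, the engine reads a small metadata file named latest inside the checkpoint directory to learn which checkpoint tag to load; if the file is absent, nothing is loaded and training starts from random initialization. A sketch of that convention (illustrative only; the helper below is hypothetical, not the actual DeepSpeed implementation):

import os
from typing import Optional

def resolve_checkpoint_tag(checkpoints_dir: str, tag: Optional[str] = None) -> Optional[str]:
    # An explicitly passed tag takes precedence over the metadata file.
    if tag is not None:
        return tag
    latest_path = os.path.join(checkpoints_dir, "latest")
    if os.path.isfile(latest_path):
        with open(latest_path) as f:
            return f.read().strip()   # e.g. a tag like "global_step1000"
    return None                       # no metadata file -> start from random weights

tag = resolve_checkpoint_tag("/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints")
print("starting from random" if tag is None else f"resuming from {tag}")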
-time (ms) | load-checkpoint: 2.23
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 125.2213504
-estimated model parameters: 103.3650944
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first
and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings 
will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated 
model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters 
without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: 
UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first 
and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters 
without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064estimated model parameters without embeddings: 103.368064 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-10 11:10:59 -> building train, validation, and test datasets ... 
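The UserWarning above explains why ranks report different totals: with pipeline parallelism (PP > 1) the first and last stages each hold a copy of the tied word embeddings, so a parameter count that includes embeddings books the same matrix more than once when summed across ranks. A minimal sketch of the over-count, assuming a hypothetical config (VOCAB, HIDDEN, LAYERS, and PP below are illustrative, not this run's values):

    VOCAB, HIDDEN, LAYERS, PP = 50_000, 11_600, 64, 8   # hypothetical config

    def stage_param_count(stage: int) -> int:
        """Parameters held by one pipeline stage (weights only, no biases)."""
        per_layer = 12 * HIDDEN * HIDDEN            # attention + MLP blocks
        count = (LAYERS // PP) * per_layer
        if stage in (0, PP - 1):                    # first and last stage each
            count += VOCAB * HIDDEN                 # hold a copy of the tied embeddings
        return count

    naive_total = sum(stage_param_count(s) for s in range(PP))
    true_total = naive_total - VOCAB * HIDDEN       # one embedding matrix, not two
    print(f"naive sum over ranks: {naive_total / 1e9:.3f}B")
    print(f"deduplicated:         {true_total / 1e9:.3f}B")

Subtracting the duplicated copy recovers the true total, which is why the log prints a separate "without embeddings" estimate alongside the flagged one.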
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 3000320
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.035739 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.123 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.107 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.031 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-10 11:11:05
-done with setup ...
-training ...
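The index-mapping totals above are internally consistent: one epoch over the 288,714,672 training documents yields 657,686,117 / 5 ≈ 131.5M samples of sequence length 2048, and 5 is the smallest epoch count that covers the 600,000,000-sample target. A quick check of that arithmetic (values copied from the log; the per-epoch figure is derived, not logged):

    import math

    target_samples = 600_000_000   # "train: 600000000" above
    total_samples  = 657_686_117   # "total number of samples: 657686117"
    epochs         = 5             # "total number of epochs: 5"

    samples_per_epoch = total_samples / epochs               # ~131.5M sequences per epoch
    assert math.ceil(target_samples / samples_per_epoch) == epochs

    seq_len = 2048                                           # from the ..._2048sl_... index names
    print(f"tokens requested for training: {target_samples * seq_len:.4e}")  # ~1.229e+12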
-Number of parameters: 125.2213504 billion
-time (ms) | model-and-optimizer-setup: 4922.44 | train/valid/test-data-iterators-setup: 4880.27
-Number of parameters: 125.22432 billion
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
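Two kinds of totals appear above: the with-embeddings counts (125.2213504, 125.22432 billion), which the earlier warning flags as inaccurate under PP > 1, and the without-embeddings counts (103.3650944, 103.368064 billion), which vary slightly per rank because stages hold slightly different slices. As a sanity check, the without-embeddings figure sits close to the standard 12·L·h² transformer estimate; a sketch with 64 layers (matching the "64 total layers" checkpointing line below) and an assumed hidden size, chosen only to land near the logged value rather than read from this run's config:

    # Back-of-the-envelope GPT-style parameter count; hidden and vocab are assumptions.
    layers, hidden, vocab, seq = 64, 11_600, 50_257, 2_048

    block_params = 12 * layers * hidden ** 2       # attention + MLP weight matrices
    embed_params = vocab * hidden + seq * hidden   # word + position embeddings

    print(f"without embeddings: ~{block_params / 1e9:.2f}B")            # ~103.34B
    print(f"one embedding copy adds ~{embed_params / 1e9:.2f}B more")   # ~0.61B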
-[before the start of training step] datetime: 2021-10-10 11:11:05
-[2021-10-10 11:11:05,177] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-10 11:11:05,178] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-10 11:11:05,178] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-10 11:11:05,178] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-10 11:11:05,178] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
[four ranks raised the same exception and their tracebacks interleave in the shared log; they are collapsed into the single copy below]
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 246, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/training.py", line 165, in pretrain
-    iteration = train(forward_step_func,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/training.py", line 732, in train
-    train_step(forward_step_func,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/training.py", line 405, in train_step
-    loss = model[0].train_batch(data_iter=data_iterator)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/engine.py", line 329, in train_batch
-    self._exec_schedule(sched)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/engine.py", line 1313, in _exec_schedule
-    self._exec_instr(**cmd.kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/engine.py", line 631, in _exec_forward_pass
-    outputs = super().forward(inputs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/engine.py", line 1321, in forward
-    loss = self.module(*inputs, **kwargs)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/module.py", line 352, in forward
-    x = self.activation_checkpoint_func(
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 743, in checkpoint
-    CheckpointFunction.apply(function, all_outputs, *args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 582, in forward
-    outputs = run_function(*inputs_cuda)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/cutting-edge/deepspeed/deepspeed/runtime/pipe/module.py", line 330, in exec_func
-    inputs = layer(inputs)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/transformer.py", line 588, in forward
-    return super().forward(hidden_states, attention_mask, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/transformer.py", line 479, in forward
-    self.self_attention(layernorm_output,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/transformer.py", line 333, in forward
-    attention_probs = self.scale_mask_softmax(attention_scores,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
-    result = self.forward(*input, **kwargs)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/fused_softmax.py", line 157, in forward
-    mask_output = self.mask_func(input, mask) if mask is not None else input
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/megatron/model/utils.py", line 43, in attention_mask_func
-    attention_scores.masked_fill_(attention_mask, -10000.0)
-RuntimeError: The expanded size of the tensor (64) must match the existing size (2048) at non-singleton dimension 3. Target sizes: [1, 20, 64, 64]. Tensor sizes: [1, 1, 2048, 2048]
-Killing subprocess 583904
-Killing subprocess 583905
-Killing subprocess 583906
-Killing subprocess 583908
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code
-    exec(code, run_globals)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in <module>
-    main()
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main
-    sigkill_handler(signal.SIGTERM, None) # not coming back
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
-    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
-subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1504567.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1.
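The launch arguments pin down the parallel layout, and the batch arithmetic follows directly: TP=4 and PP=32 consume 128 GPUs per model replica, which on this allocation (32 srun tasks below, four worker subprocesses killed per node above) leaves a data-parallel degree of 1, so a global batch of 2048 at micro-batch 1 implies 2048 gradient-accumulation steps per optimizer step. A small sketch of that arithmetic; the node and per-node GPU counts are inferred from the log, not stated in the command:

# Parallelism/batch arithmetic implied by the launch command.
tp, pp = 4, 32                      # --tensor/--pipeline-model-parallel-size
nodes, gpus_per_node = 32, 4        # inferred from srun tasks / subprocess kills
world = nodes * gpus_per_node       # 128 GPUs total
dp = world // (tp * pp)             # data-parallel degree = 1
micro, global_batch = 1, 2048       # --micro-batch-size, --global-batch-size
grad_accum = global_batch // (dp * micro)
print(dp, grad_accum)               # 1 2048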
-srun: error: r7i4n5: task 1: Exited with exit code 1
-srun: Terminating job step 1504567.0
-slurmstepd: error: *** STEP 1504567.0 ON r7i4n4 CANCELLED AT 2021-10-10T11:11:11 ***
[the launcher on every node then kills its four worker subprocesses; the per-PID "Killing subprocess" and "Main process received SIGTERM, exiting" lines are elided]
-srun: error: tasks 0 and 2-31 (nodes r7i4n4 through r9i5n5): Exited with exit code 1
[a new launch follows; every launcher process prints the OMP_NUM_THREADS banner below, and the repeated copies are elided]
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
[every rank prints this same extension op report at startup; the dozens of interleaved copies are collapsed into the single one above]
[OKAY] -[OKAY][OKAY][OKAY]-------------------------------------------------- - - - ---------------------------------------------------op name---------------------------------------------------------------------------------------------------- - - -................op nameop name op name installed................ ................................ .. installed installedinstalled compatible .. -.... -------------------------------------------------- compatible -compatible -compatible - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -cpu_adam ............... [YES] ...... [OKAY]cpu_adamcpu_adam - cpu_adam............... ............... [YES] ............... [YES] ......[YES]fused_adam ............[OKAY]............. -[OKAY][OKAY][NO] - -....... [OKAY] -fused_lamb fused_adam............. fused_adam.............[NO]fused_adam .................................[NO] [NO] .......[OKAY] [NO]....... - [OKAY] .......[OKAY] - -[OKAY] -fused_lamb fused_lamb.............fused_lamb sparse_attn ............. [NO]......................... [NO].......[NO] [NO] [OKAY].............. ....... -[OKAY][OKAY] - -[OKAY] -transformer ............ [NO] ....... [OKAY]sparse_attn - sparse_attn............ stochastic_transformer[NO] .............sparse_attn....... [NO] [NO][OKAY] ............ - .......[NO]....... transformer .......[OKAY][OKAY] - -............[OKAY] -transformer[NO] transformer................... ............[NO] [OKAY] [NO] -....... .......[OKAY] stochastic_transformer -[OKAY] -stochastic_transformer .stochastic_transformer .[NO] [NO]........ .......[OKAY][NO] - [OKAY]....... - [OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op nameop name op nameop name ................ ................ ................installed................ installedinstalledinstalled.. ...... compatible compatible -compatiblecompatible --------------------------------------------------- - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -cpu_adam ............... [YES]cpu_adamcpu_adam cpu_adam ............... ..................... [YES]...............[OKAY][YES] - ......[YES]...... [OKAY]......[OKAY] - - [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_adamfused_adamfused_adam fused_lamb............. ............. ............. [NO] .............[NO] [NO]....... ....... [NO] .......[OKAY][OKAY] - -[OKAY]....... - [OKAY]fused_lambfused_lamb - fused_lamb .......................... .............[NO][NO] .......[NO]....... [OKAY]....... -sparse_attn [OKAY] [OKAY] -............ - [NO] ....... [OKAY] -transformer ............sparse_attn [NO]............sparse_attnsparse_attn [NO]................... ............ ....... [NO][OKAY] [NO] -[OKAY] ....... -....... [OKAY][OKAY]transformer - - stochastic_transformer............ transformer transformer [NO] . ........................ ....... [NO][NO] [NO] [OKAY]....... ....... - ....... [OKAY] [OKAY] -[OKAY] - -stochastic_transformer stochastic_transformerstochastic_transformer. .[NO]. .......[NO][NO] [OKAY].............. - [OKAY][OKAY] - -ninjaninjaninjaninja .................. 
.................. [OKAY]....................................[OKAY] - - --------------------------------------------------[OKAY]-------------------------------------------------- -[OKAY] - -op name --------------------------------------------------- op name--------------------------------------------------................ - -................installedop name op name installed.................................. .. compatibleinstalledinstalled - compatible --------------------------------------------------.. -.. - -------------------------------------------------- compatible -ninjaninjaninjaninja ...................................................... .................. [OKAY][OKAY][OKAY] -compatible - ----------------------------------------------------------------------------------------------------- -cpu_adam - ............... [YES]cpu_adam ..................... [OKAY][YES]cpu_adamcpu_adam - .................................... [OKAY][YES][YES] -[OKAY] --------------------------------------------------- - - --------------------------------------------------- ---------------------------------------------------op name-------------------------------------------------- - -op name................op name op nameinstalled................................ ................installed.. installed installed ..compatible.. -.. --------------------------------------------------compatiblecompatiblecompatible - - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - - ............ fused_adam[OKAY] [OKAY] -............. - [NO] fused_adam....... .............[OKAY] -[NO] ....... fused_adamfused_lamb[OKAY]fused_adam -cpu_adam cpu_adamcpu_adam...............cpu_adam .............................................[YES] ...... [YES][YES][OKAY][YES] - ............. .............[NO].............fused_lamb [NO] .......[NO] ............. ..............[OKAY][NO] -[OKAY].......[OKAY] - -[OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - - .................. [OKAY][OKAY][OKAY] - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -fused_adam ............. [NO] ....... fused_adamfused_adam[OKAY]fused_adam -fused_lambfused_lamb .......................... [NO][NO] sparse_attn.............. sparse_attn............[OKAY][OKAY] - -op nameop nameop nameop name ................................................................ installedinstalledinstalledinstalled ...... .. compatiblecompatible compatible - - -compatible---------------------------------------------------------------------------------------------------- - --------------------------------------------------- - --------------------------------------------------- - .......................................fused_lamb [NO][NO]............. [NO] ..............[NO] .............. [OKAY][OKAY] -[NO]............ .......[NO] [OKAY]....... - [OKAY] - [OKAY] -[OKAY] - -fused_lambfused_lamb .............fused_lamb............. [NO].............[NO] .......[NO]....... sparse_attn [OKAY].......[OKAY]............ - -transformer ............transformer sparse_attn [NO]sparse_attn ............ ................... ............[NO] [NO][OKAY] .......[NO] - ....... 
[OKAY] -.......[OKAY] stochastic_transformer -cpu_adamcpu_adam cpu_adam ...............cpu_adam .............................. ...............[YES][YES][YES] [YES]............ ...... [OKAY]...... -stochastic_transformer[OKAY] transformer -[OKAY][OKAY] - -[OKAY] - [NO][OKAY] -....... [OKAY] -. . transformer............ [NO] [NO] ............ .......[NO] .......[NO][OKAY] -fused_adam ............. fused_adam[NO] .............fused_adamfused_adam....... [OKAY]..........................[NO] -transformer ............sparse_attn sparse_attn[NO] ............ .......sparse_attn[NO]............ [OKAY] ............ -.......[NO] [NO][OKAY].......stochastic_transformer -.......[OKAY]....... - [OKAY][OKAY] - - [NO][NO]....... fused_lamb .............. [OKAY][OKAY][OKAY] -............. - - .......[OKAY] -[OKAY].transformer -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY] -[OKAY] - [NO] fused_lamb.......fused_lamb fused_lamb [OKAY]....................................... - transformer[NO]............transformer ...................[NO]............ [NO] [OKAY] ....... - [NO][NO][NO] ..................... [OKAY] [OKAY] -[OKAY] - -[NO] ....... [OKAY] ....... -[OKAY] -[OKAY] -sparse_attn ............ [NO] ....... [OKAY] -stochastic_transformerstochastic_transformer stochastic_transformer. . [NO].[NO] .......[NO]....... [OKAY] -.......[OKAY] -[OKAY] -transformersparse_attnsparse_attnsparse_attn ............ ............ ........................ [NO] [NO][NO] [NO] ....... .............. ....... [OKAY] [OKAY][OKAY] - -[OKAY] - -transformertransformerstochastic_transformer transformer ............ ............ .............[NO][NO] [NO].............. [NO] [OKAY].......[OKAY] - - .......[OKAY] -stochastic_transformer[OKAY] stochastic_transformer - . .[NO] stochastic_transformer....... [NO][OKAY]. -....... [NO][OKAY] -....... [OKAY] -ninjaninjaninjaninja ...................................................... [OKAY] .................. -[OKAY][OKAY]-------------------------------------------------- - - -[OKAY]op name---------------------------------------------------------------------------------------------------- - - -................--------------------------------------------------op name op name installed - ................ ..................op nameinstalled compatible -..installed................ -------------------------------------------------- compatible -..installed - --------------------------------------------------compatible.. - - compatible-------------------------------------------------- - -cpu_adam-------------------------------------------------- -............... [YES]cpu_adam .....................cpu_adam [OKAY] [YES] -cpu_adam ..................... ...............[YES][OKAY] -......[YES] [OKAY]......fused_adam - .............[OKAY] [NO] - .......fused_adam [OKAY]............. - [NO]fused_adam ....... fused_lamb .............[OKAY]fused_adam............. - [NO][NO]............. fused_lamb ....... .......[NO]............. [OKAY][OKAY][NO]....... - - .......[OKAY] [OKAY] -fused_lamb - ............. fused_lamb[NO] .................... sparse_attn[NO][OKAY] -................... sparse_attn[NO][OKAY] ............ -....... [NO][OKAY] -....... [OKAY] -transformer ............sparse_attn transformer [NO] ............sparse_attn ............ ....... [NO][NO] ............ [OKAY]....... -....... [NO] [OKAY] [OKAY]stochastic_transformer -....... - [OKAY].transformer -stochastic_transformer [NO]transformer............. ...................[NO] [NO] [OKAY].............. 
-[NO] [OKAY][OKAY] ....... - - [OKAY] -stochastic_transformer stochastic_transformer . .[NO] [NO]....... .......[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -------------------------------------------------------------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------DeepSpeed C++/CUDA extension op report - - - -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................................... .................. ..................[OKAY][OKAY][OKAY] - -[OKAY] ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - - ---------------------------------------------------op nameop name - op name................ ................op name ................ installed installed................ installed .. .... installed compatiblecompatible -..compatible - ---------------------------------------------------------------------------------------------------- -compatible - - --------------------------------------------------- --------------------------------------------------- -cpu_adamcpu_adam cpu_adam...............cpu_adam ...............[YES].............................. [YES][YES] ...... [YES] [OKAY]............ - ......[OKAY] -[OKAY][OKAY] - -fused_adam ............. [NO]fused_adamfused_adam fused_adam ....... ....................................... [NO] [OKAY][NO] [NO] -....... .......[OKAY].......fused_lamb - [OKAY][OKAY] -............. -fused_lamb [NO]............. fused_lamb .......[NO] fused_lamb ............. [OKAY] ....... - .............[NO][OKAY] -[NO]....... [OKAY]....... - [OKAY] -sparse_attn ............ [NO] sparse_attn....... ............[OKAY] -[NO]sparse_attn transformersparse_attn................... ........................[NO][OKAY] [NO] - .......[NO]....... transformer [OKAY].......[OKAY]............ - - [OKAY][NO]transformer - stochastic_transformer................... transformer [OKAY] .[NO] -............ [NO].......stochastic_transformer[NO] ....... ....... [OKAY].[OKAY][OKAY] - - -[NO] ....... stochastic_transformer[OKAY]stochastic_transformer - . .[NO] [NO]....... 
.......[OKAY] -[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- --------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- -DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -DeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................. .................. [OKAY].................. [OKAY] - [OKAY] -[OKAY]-------------------------------------------------- --------------------------------------------------- - - ---------------------------------------------------op name-------------------------------------------------- -op name - op name................................op name ................installedinstalled................ installed.. .. installed ..compatiblecompatible - -..compatible---------------------------------------------------------------------------------------------------- - - -compatible-------------------------------------------------- - --------------------------------------------------- -cpu_adamcpu_adam cpu_adam.............................. cpu_adam [YES][YES]............... ...... ............... ...... [OKAY][YES][OKAY][YES] - - ............ [OKAY][OKAY] - -fused_adam fused_adam............. .............[NO] fused_adamfused_adam [NO] .................... .................... [OKAY][NO] -[OKAY][NO] - fused_lamb.............. [OKAY][OKAY]fused_lamb............. - - fused_lamb fused_lamb.............[NO]............. [NO] ............. [NO]....... ....... [NO] ....... [OKAY][OKAY]....... - - [OKAY][OKAY] - -sparse_attnsparse_attn ............ sparse_attn sparse_attn............[NO] ............ ............[NO] ....... [NO] ....... [NO].......[OKAY][OKAY] ....... - -[OKAY] [OKAY]transformer - - transformer............ transformertransformer ............ [NO] ........................ [NO] ....... [NO][NO]....... [OKAY] .......[OKAY] -....... - [OKAY][OKAY] - -stochastic_transformerstochastic_transformer stochastic_transformerstochastic_transformer. . [NO]..[NO] .......[NO][NO] ....... [OKAY] .............. 
-[OKAY] - [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [YES] [YES]...... ......[OKAY] -[OKAY] -fused_adam fused_adam............. .............[NO] [NO]....... .......[OKAY] -[OKAY] -fused_lamb fused_lamb............. .............[NO] [NO]....... .......[OKAY] -[OKAY] -sparse_attnsparse_attn ........................ [NO][NO] ....... .......[OKAY] -[OKAY] -transformertransformer ........................ [NO][NO] .............. [OKAY][OKAY] - -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ----------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................................... .................. ..................[OKAY][OKAY][OKAY] - -[OKAY] --------------------------------------------------- - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- -op nameop name - op name op name................ ................................ installed................installedinstalled .... ..installed compatible compatiblecompatible - -.. --------------------------------------------------- -------------------------------------------------- ---------------------------------------------------compatible - - --------------------------------------------------- -cpu_adamcpu_adam cpu_adam.............................. cpu_adam ............... [YES][YES] ............... [YES] ......[YES]...... [OKAY]......[OKAY]...... - -[OKAY] -[OKAY] -fused_adamfused_adam .............fused_adamfused_adam............. [NO].............[NO]............. [NO]..............[NO] .......[OKAY][OKAY]....... - - [OKAY][OKAY] - -fused_lamb fused_lambfused_lambfused_lamb ............. ............. .......................... [NO] [NO] [NO][NO] ....... ....... .............. 
[OKAY] [OKAY] -[OKAY][OKAY] - - -sparse_attnsparse_attnsparse_attnsparse_attn ........................ ........................ [NO][NO] [NO] [NO] ....... .............. ....... [OKAY][OKAY] - -[OKAY][OKAY] -transformer -transformer transformer............transformer............ ............[NO] ............ [NO] [NO] .......[NO] ....... ....... ....... [OKAY][OKAY][OKAY] - -[OKAY] - -stochastic_transformerstochastic_transformer stochastic_transformerstochastic_transformer .. .[NO].[NO] [NO]..............[NO] ..............[OKAY][OKAY] - -[OKAY][OKAY] - -ninjaninjaninja ninja.................. .................. ..................[OKAY][OKAY].................. - - [OKAY]--------------------------------------------------[OKAY]-------------------------------------------------- - - - -op name--------------------------------------------------op name-------------------------------------------------- - - ................op nameop name ................ installed ................ ................ installed..installedinstalled ....compatible.. compatible - -compatiblecompatible---------------------------------------------------------------------------------------------------- - - - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adamcpu_adamcpu_adam ............... ............... ............... [YES]............... [YES] [YES] [YES]............ ......[OKAY] [OKAY] -[OKAY]...... - - [OKAY] -fused_adam .............fused_adamfused_adam fused_adam.............[NO]............. .............[NO][NO]....... [NO].......[OKAY] ....... -[OKAY]....... - [OKAY] -[OKAY]fused_lamb -fused_lamb fused_lamb............. fused_lamb[NO].......................... ....................[NO][NO] [OKAY][NO]....... -....... [OKAY].......[OKAY] - -[OKAY] -sparse_attn ............ [NO]sparse_attn .......sparse_attn............ sparse_attn[OKAY] ........................[NO] - transformer[NO][NO]....... ..........................[OKAY] [OKAY] -[NO][OKAY] - .......transformer -transformer [OKAY]............transformer............ - [NO] [NO] ............ stochastic_transformer....... ....... [NO][OKAY].[OKAY] - - [NO]....... .......stochastic_transformerstochastic_transformer[OKAY] [OKAY] - -. .[NO] stochastic_transformer [NO] ....... ....... .[OKAY][OKAY] - -[NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - --------------------------------------------------- --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- -JIT compiled ops requires ninja - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. [OKAY] [OKAY] -[OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -op nameop name - op name................ op name................ ................ installed................installedinstalled ..installed.. .. compatible compatible -..compatible --------------------------------------------------- - --------------------------------------------------- -------------------------------------------------- -compatible - --------------------------------------------------- -cpu_adam ............... [YES]cpu_adam cpu_adam.....................cpu_adam [YES][OKAY] ............... - ............... ...... [YES] [OKAY][YES]...... - [OKAY]......fused_adam - .............[OKAY] -[NO] .......fused_adam [OKAY]............. - [NO]fused_adam ....................fused_lamb fused_adam[OKAY].............[NO] -....................[NO] fused_lamb.......[OKAY][NO] - .............[OKAY]....... -[NO] fused_lamb [OKAY]....... - .............[OKAY] -[NO]fused_lamb .......sparse_attn............. [OKAY]............ - [NO][NO] ..............sparse_attn [OKAY][OKAY]............ - - sparse_attn[NO] transformer ............ ....... ............ [NO] [OKAY] [NO] -....... .......[OKAY] transformer -[OKAY]sparse_attn -transformer............ ........................[NO]stochastic_transformer [NO][NO]....... . .......[OKAY] [NO] - [OKAY].............. -stochastic_transformer [OKAY][OKAY]. stochastic_transformer - -[NO] ........ [OKAY]transformer[NO] - ................... [OKAY][NO] - ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -DeepSpeed C++/CUDA extension op report--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - - - ---------------------------------------------------JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninja-------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninja -DeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ........................................................................ 
[OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop nameop nameop name ................................................................ installed installedinstalled installed .. .. .... compatible compatiblecompatible -compatible - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - -cpu_adamcpu_adam cpu_adam ............... ...............cpu_adam ...............[YES] [YES][YES]............... ...... ...... ......[YES] [OKAY] [OKAY] -[OKAY] -...... - [OKAY] -fused_adamfused_adamfused_adam .......................................fused_adam [NO][NO][NO] ............. .............. ....... [NO] [OKAY][OKAY][OKAY] - -....... - [OKAY]fused_lamb -fused_lamb .............fused_lamb............. [NO]fused_lamb.............[NO] ....................[NO]....... .......[OKAY][NO][OKAY] - -[OKAY]....... - [OKAY] -sparse_attnsparse_attn sparse_attn ........................ ............[NO]sparse_attn[NO] ....... [NO]....... ............ [OKAY] .......[OKAY] -[NO] -[OKAY] -.......transformer transformer transformer............[OKAY] - ............[NO]............ [NO].......[NO]transformer [OKAY].......................... - [OKAY][OKAY][NO] - -stochastic_transformer ....... [OKAY].stochastic_transformerstochastic_transformer - [NO] .........stochastic_transformer [NO][NO][OKAY] - ............... [OKAY][NO][OKAY] - -....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - -op name -op name op name ................op name ................ ................installedinstalled................ ..installed.. installed compatible....compatible - ---------------------------------------------------compatible-------------------------------------------------- -compatible - - --------------------------------------------------- --------------------------------------------------- -cpu_adam cpu_adam............... cpu_adam...............cpu_adam[YES] [YES].................................... ......[OKAY][YES] [YES][OKAY] - -............ [OKAY][OKAY] - -fused_adamfused_adam .......................... fused_adam [NO][NO]fused_adam............. .................... ....... [NO][NO] [OKAY][OKAY] -....... -....... fused_lamb[OKAY][OKAY] -fused_lamb............. - .............fused_lamb[NO] fused_lamb [NO]............. ........................... [OKAY] [NO][NO] - [OKAY] ....... -....... [OKAY][OKAY] - -sparse_attn ............ [NO] sparse_attn....... sparse_attn............sparse_attn[OKAY] - [NO]........................ 
transformer ....... [NO][OKAY][NO]............ - .......[NO]....... transformer [OKAY] .......[OKAY] -............ - [OKAY]transformer[NO] -transformer ............................... stochastic_transformer [NO][NO][OKAY] - ............... [OKAY][NO][OKAY]stochastic_transformer - ....... - [OKAY] -stochastic_transformer. [NO]stochastic_transformer . ....... .[NO][OKAY] - [NO]....... .......[OKAY] -[OKAY] ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - - - ---------------------------------------------------JIT compiled ops requires ninjaJIT compiled ops requires ninjaJIT compiled ops requires ninja - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatibleninja - --------------------------------------------------.................. -[OKAY] --------------------------------------------------- -op name ................ installed ..cpu_adam compatible -...............-------------------------------------------------- -[YES] ...... [OKAY] -cpu_adam ............... [YES] ...... [OKAY] -fused_adam fused_adam............. ............. [NO][NO] .............. [OKAY] -[OKAY] -fused_lamb ............. [NO] fused_lamb....... [OKAY]............. - [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -sparse_attntransformer ........................ [NO] [NO]....... .......[OKAY] - [OKAY] -stochastic_transformertransformer ............. [NO][NO] ....... [OKAY]....... - [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninjaninjaninja ...................................................... .................. 
[OKAY][OKAY][OKAY] - -[OKAY] --------------------------------------------------- - -------------------------------------------------------------------------------------------------------------------------------------------------------op name - - - op nameop name................ op name installed................................ ................ .. installed installedinstalled compatible.. .. - .. --------------------------------------------------compatible -compatiblecompatible - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -cpu_adam ............... [YES] ...... cpu_adam[OKAY]cpu_adam -cpu_adam ............... ............... ............... [YES] [YES] [YES] ...... fused_adam...... ...... [OKAY].............[OKAY] - -[OKAY] -[NO] ....... [OKAY] -fused_adamfused_lamb ..........................fused_adam fused_adam [NO][NO] ............. .................... ....... [NO][OKAY][NO] - [OKAY].............. - [OKAY][OKAY] - -fused_lamb ............. [NO]fused_lambsparse_attnfused_lamb ............................................. [OKAY][NO] [NO] - [NO] ..................... [OKAY][OKAY][OKAY] - - -transformer sparse_attn............ [NO]............ .......[NO] sparse_attnsparse_attn[OKAY] -....... ........................ stochastic_transformer [OKAY][NO] [NO] - ............... transformer [OKAY][NO][OKAY]............ - -....... transformer [NO] transformer[OKAY] -............................... [NO][OKAY][NO] -.............. [OKAY][OKAY]stochastic_transformer - - . [NO]stochastic_transformer stochastic_transformer ....... .[OKAY]. -[NO] [NO]....... .......[OKAY] - [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
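JIT compilation here means any op reported as not installed is built with ninja on first use, so a missing ninja only surfaces mid-startup. A minimal pre-flight sketch (stdlib only; the error message wording is illustrative):

    import importlib.util
    import shutil

    # ninja can be satisfied by the `ninja` Python package or a system binary;
    # torch's JIT extension builder only needs the executable to be reachable.
    if importlib.util.find_spec("ninja") is None and shutil.which("ninja") is None:
        raise RuntimeError("ninja not found: DeepSpeed JIT op compilation will fail")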
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
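The table's two columns are distinct: "installed" ([YES] only for cpu_adam here) means a prebuilt extension already exists, while "compatible"/[OKAY] means the op can still be JIT-built on this system. A hedged sketch of querying the same verdicts, assuming the deepspeed.ops.op_builder interface of the 0.5.x line in this log:

    from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder

    # is_compatible() mirrors the [OKAY] column: it checks the build
    # prerequisites, not whether the compiled extension is already present.
    for builder in (CPUAdamBuilder(), FusedAdamBuilder()):
        print(builder.NAME, builder.is_compatible())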
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
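Every async_io line above reduces to one missing system dependency, the libaio development headers; the op is optional (used for NVMe offload), so the run proceeds anyway. A hedged probe, assuming the same op_builder interface (AsyncIOBuilder is the assumed class name in this DeepSpeed line):

    from deepspeed.ops.op_builder import AsyncIOBuilder

    # Expected to print False on these nodes until libaio-devel is installed
    # (or CFLAGS/LDFLAGS point at a source build), matching [NO] ....... [NO].
    print(AsyncIOBuilder().is_compatible())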
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.async_io - ............... [NO] ....... [NO] -async_io ............... transformer_inference[NO] ......... [NO][NO] -....... [OKAY] -utils .................. [YES] ...... [OKAY]transformer_inference - .. [NO] quantizer....... ..............[OKAY] -[NO] ....... [OKAY] -utils ..................-------------------------------------------------- -[YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
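Every rank prints this same "general environment info" block. The underlying facts can be reproduced directly; a minimal sketch, assuming only that torch and deepspeed import cleanly in the same conda environment:

    # Sketch of the version probe (assumption: run inside the same conda env
    # the job uses; the inline values are the ones this log reports).
    import torch
    import deepspeed

    print("torch install path ...............", torch.__path__)
    print("torch version ....................", torch.__version__)    # 1.8.1 above
    print("torch cuda version ...............", torch.version.cuda)   # 11.1 above
    print("deepspeed info ...................", deepspeed.__version__)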
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
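The "git_hash=unknown" line follows directly from the `/bin/sh: line 0: type: git: not found` message: Megatron probes for git through the shell and falls back to "unknown" when the binary is not on PATH. A minimal sketch of that kind of probe, illustrative only and not Megatron's actual code:

    # Assumption: this mirrors the shell probe that produced the message
    # above; it is illustrative, not Megatron's implementation.
    import subprocess

    probe = subprocess.run(["/bin/sh", "-c", "type git"],
                           capture_output=True, text=True)
    if probe.returncode != 0:
        print("git not on PATH -> git_hash=unknown git_branch=unknown")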
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain
-    initialize_megatron(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron
-    set_global_variables(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables
-    _ = _build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer
-    _GLOBAL_TOKENIZER = build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer
-    tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__
-    self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__
-    self.encoder = json.load(open(vocab_file))
-FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json'
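Every rank then dies with this same FileNotFoundError: the GPT-2 BPE vocab file is absent from the data/ directory the job points at. One way to stage the two tokenizer files is sketched below; the URLs are the standard GPT-2 BPE artifacts the Megatron-LM README references and are an assumption here, not taken from this log, and on a cluster like this the download would have to run from a node with internet access:

    # Sketch: stage gpt2-vocab.json and gpt2-merges.txt where the traceback
    # expects them. URLs are assumed (standard GPT-2 BPE artifacts), not
    # taken from this log.
    import urllib.request
    from pathlib import Path

    data_dir = Path("/gpfswork/rech/six/commun/code/tr8b-104B/"
                    "Megatron-DeepSpeed-tr8b-104B/data")
    data_dir.mkdir(parents=True, exist_ok=True)
    for name in ("gpt2-vocab.json", "gpt2-merges.txt"):
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/" + name
        urllib.request.urlretrieve(url, data_dir / name)
        print("fetched", data_dir / name)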
- File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO]utils ......................... [OKAY][YES] - ...... [OKAY] -utils .................. quantizer[YES] .................... [NO][OKAY] -....... [OKAY] -quantizer .............. [NO]-------------------------------------------------- -....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - initialize_megatron(extra_args_provider=extra_args_provider, - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -utils ..................utils [YES].................. ......[YES] [OKAY]...... - [OKAY] -quantizer ..............quantizer [NO].............. .......[NO] [OKAY]....... - [OKAY] ----------------------------------------------------------------------------------------------------- - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] - set_global_variables(extra_args_provider=extra_args_provider, -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - _ = _build_tokenizer(args) -/bin/sh: line 0: type: git: not found - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master - _GLOBAL_TOKENIZER = build_tokenizer(args) -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer -torch install path ...............torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']torch version - .................... torch version1.8.1 -.................... 1.8.1torch cuda version - ............... torch cuda version11.1 -...............nvcc version 11.1..................... - 11.2nvcc version - deepspeed install path..................... ...........11.2 -deepspeed install path ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']........... - deepspeed info ...................['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -0.5.5+cd7967d, cd7967d, master -deepspeed info deepspeed wheel compiled w.................... 
......0.5.5+cd7967d, cd7967d, master -torch 1.8, cuda 11.1deepspeed wheel compiled w. - ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -/bin/sh: line 0: type: git: not found - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
-**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - [WARNING]  async_io: please install the libaio-devel package with yum - self.encoder = json.load(open(vocab_file)) - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found - [WARNING]  async_io: please install the libaio-devel package with yum -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. 
[NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -Traceback (most recent call last): -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install pathDeepSpeed general environment info:DeepSpeed general environment info: ............... - -torch install pathtorch install path ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'].............................. - torch version .................... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']1.8.1 - - -torch versiontorch cuda versiontorch version ....................................................... 11.11.8.11.8.1 - - -nvcc version torch cuda version.....................torch cuda version ...............11.2............... - 11.111.1deepspeed install path - - nvcc versionnvcc version........... .......................................... 11.211.2['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - - -deepspeed install pathdeepspeed install pathdeepspeed info ......................................... 0.5.5+cd7967d, cd7967d, master -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed wheel compiled w.['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed info......deepspeed info ...................torch 1.8, cuda 11.1................... - 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ 
torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - set_global_variables(extra_args_provider=extra_args_provider,initialize_megatron(extra_args_provider=extra_args_provider, - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -set_global_variables(extra_args_provider=extra_args_provider, - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - set_global_variables(extra_args_provider=extra_args_provider, - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - [WARNING]  async_io: please install the libaio-devel package with yum - File 
"/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -DeepSpeed general environment info: - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - self.encoder = json.load(open(vocab_file)) --------------------------------------------------- -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - _GLOBAL_TOKENIZER = build_tokenizer(args) File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - File 
"/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', -self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - self.encoder = json.load(open(vocab_file))self.encoder = json.load(open(vocab_file)) - -FileNotFoundErrorFileNotFoundError : self.encoder = json.load(open(vocab_file))[Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json': - -[Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master - self.encoder = json.load(open(vocab_file)) -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer  [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found............... - [NO] ....... [OKAY] --------------------------------------------------- -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - _GLOBAL_TOKENIZER = build_tokenizer(args) -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -Traceback (most recent call last): -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - [WARNING]  async_io: please 
install the libaio-devel package with yum - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -/bin/sh: line 0: type: git: not found -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES]  [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found....... -[OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum -quantizer .............. [NO] ....... [OKAY] -async_io-------------------------------------------------- -............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, -initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 - _ = _build_tokenizer(args) -nvcc version ..................... 11.2 - set_global_variables(extra_args_provider=extra_args_provider, File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - _GLOBAL_TOKENIZER = build_tokenizer(args) -_ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -_GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer -DeepSpeed general environment info: - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -torch cuda version ............... 11.1 - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  async_io: please install the libaio-devel package with yum -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -/bin/sh: line 0: type: git: not found - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', -torch version .................... 1.8.1 - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -**** Git info for Megatron: git_hash=unknown 
git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -/bin/sh: line 0: type: git: not found - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File 
"/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -Traceback (most recent call last): -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, -pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, -initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider,set_global_variables(extra_args_provider=extra_args_provider, - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args)_ = _build_tokenizer(args) - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) -_GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - self.encoder = json.load(open(vocab_file)) - self.encoder = json.load(open(vocab_file)) -FileNotFoundErrorFileNotFoundError: : [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -initialize_megatron(extra_args_provider=extra_args_provider, - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - set_global_variables(extra_args_provider=extra_args_provider, - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - [WARNING]  async_io: please install the libaio-devel package with yum - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - self.encoder = json.load(open(vocab_file)) -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - [WARNING]  async_io: please install the libaio-devel package with yum - _ = _build_tokenizer(args) - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - _ = _build_tokenizer(args) -async_io ............... [NO] ....... [NO] - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer -transformer_inference .. [NO] ....... [OKAY] - _GLOBAL_TOKENIZER = build_tokenizer(args) -utils .................. [YES] ...... [OKAY] - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -/bin/sh: line 0: type: git: not found - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... 
[NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -DeepSpeed general environment info: -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
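[Note: ds_report marks async_io as [NO] above because the libaio development files cannot be found on the compute nodes, as the three warnings say. A minimal stdlib-only sketch of the same availability probe; this is an illustration under that assumption, not the check DeepSpeed itself runs:]

    import ctypes.util

    # find_library returns None when the loader cannot locate libaio,
    # which is the condition the [WARNING] lines above describe.
    if ctypes.util.find_library("aio") is None:
        print("libaio not found: install libaio-devel, or set CFLAGS/LDFLAGS "
              "to a source build before rebuilding DeepSpeed's async_io op")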
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain
-    initialize_megatron(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron
-    set_global_variables(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables
-    _ = _build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer
-    _GLOBAL_TOKENIZER = build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer
-    tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__
-    self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__
-    self.encoder = json.load(open(vocab_file))
-FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json'
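[Note: this FileNotFoundError is what kills the job on every rank: --vocab-file resolves to a path that does not exist on the compute nodes. A hypothetical pre-flight check that could run before launch; the gpt2-merges.txt path is assumed by analogy, since only the vocab path appears in the traceback:]

    import os

    # Report missing tokenizer files before launching the 128-rank job.
    def missing_tokenizer_files(*paths):
        return [p for p in paths if not os.path.isfile(p)]

    missing = missing_tokenizer_files(
        "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json",
        "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt",  # assumed name
    )
    if missing:
        print("missing tokenizer files:", *missing, sep="\n  ")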
[OKAY] - set_global_variables(extra_args_provider=extra_args_provider, --------------------------------------------------- - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -Traceback (most recent call last): -DeepSpeed general environment info: - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - set_global_variables(extra_args_provider=extra_args_provider, - _ = _build_tokenizer(args) -set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32 - set_global_variables(extra_args_provider=extra_args_provider, - File 
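An illustrative aside (not part of the captured output): Megatron factorizes the world size into data-parallel x tensor-model-parallel x pipeline-model-parallel ranks, so the 128 processes reported above decompose as 1 x 4 x 32. A minimal sketch of that arithmetic, with values copied from the "using world size" line:

    # Illustrative check, values taken from the log line above.
    tensor_mp = 4      # tensor-model-parallel size
    pipeline_mp = 32   # pipeline-model-parallel size
    data_p = 1         # data-parallel size
    assert tensor_mp * pipeline_mp * data_p == 128  # world size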
"/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables -using torch.float16 for parameters ... ------------------------- arguments ------------------------ - _ = _build_tokenizer(args)initialize_megatron(extra_args_provider=extra_args_provider, - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer -Traceback (most recent call last): - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)_ = _build_tokenizer(args) - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - accumulate_allreduce_grads_in_fp32 .............. False - adam_beta1 ...................................... 0.9 - adam_beta2 ...................................... 0.95 - adam_eps ........................................ 1e-08 - adlr_autoresume ................................. False - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - adlr_autoresume_interval ........................ 1000 - apply_query_key_layer_scaling ................... True - apply_residual_connection_post_layernorm ........ False - attention_dropout ............................... 0.1 - attention_softmax_in_fp32 ....................... False - bert_binary_head ................................ True - bert_load ....................................... None - _GLOBAL_TOKENIZER = build_tokenizer(args) File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - bf16 ............................................ False - bias_dropout_fusion ............................. True - bias_gelu_fusion ................................ True - biencoder_projection_dim ........................ 
0 - biencoder_shared_query_context_model ............ False - block_data_path ................................. None - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - checkpoint_activations .......................... True - checkpoint_in_cpu ............................... False - checkpoint_num_layers ........................... 1 - clip_grad ....................................... 1.0 - codecarbon_dir .................................. None - consumed_train_samples .......................... 0 - consumed_train_tokens ........................... 0 - consumed_valid_samples .......................... 0 - contigious_checkpointing ........................ False - cpu_optimizer ................................... False - cpu_torch_adam .................................. False - curriculum_learning ............................. False - data_impl ....................................... mmap - data_parallel_size .............................. 1 - data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document'] - dataloader_type ................................. single - DDP_impl ........................................ local - decoder_seq_length .............................. None - deepscale ....................................... False - deepscale_config ................................ None - deepspeed ....................................... True - deepspeed_activation_checkpointing .............. True - deepspeed_config ................................ ./ds_config.1513102.json - deepspeed_mpi ................................... False - distribute_checkpointed_activations ............. False - distributed_backend ............................. nccl - embedding_path .................................. None - encoder_seq_length .............................. 2048 - eod_mask_loss ................................... False - eval_interval ................................... 1000 - eval_iters ...................................... 5 - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - evidence_data_path .............................. None - exit_duration_in_mins ........................... 1190 - exit_interval ................................... None - ffn_hidden_size ................................. 46400 - finetune ........................................ False - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - fp16 ............................................ True - fp16_lm_cross_entropy ........................... False - fp32_residual_connection ........................ False - gigaflos_no_embeds .............................. 0 - global_batch_size ............................... 2048 - glu_activation .................................. None - hidden_dropout .................................. 0.1 - hidden_size ..................................... 11600 - hysteresis ...................................... 2 - ict_head_size ................................... None - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - ict_load ........................................ 
None - img_dim ......................................... 224 - indexer_batch_size .............................. 128 - indexer_log_interval ............................ 1000 - init_method_std ................................. 0.02 - init_method_xavier_uniform ...................... False - self.encoder = json.load(open(vocab_file)) - initial_loss_scale .............................. 4294967296 - kv_channels ..................................... 145 - layernorm_epsilon ............................... 1e-05 - lazy_mpu_init ................................... None - load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - local_rank ...................................... 0 -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json'self.encoder = json.load(open(vocab_file)) - - log_batch_size_to_tensorboard ................... True - log_interval .................................... 1 - log_learning_rate_to_tensorboard ................ True - log_loss_scale_to_tensorboard ................... True - log_num_zeros_in_grad ........................... False - log_params_norm ................................. False - log_timers_to_tensorboard ....................... True -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - log_validation_ppl_to_tensorboard ............... True - loss_on_targets_only ............................ False - loss_scale ...................................... 12.0 - loss_scale_window ............................... 1000 - lr .............................................. 6e-05 - lr_decay_iters .................................. None - lr_decay_samples ................................ None - lr_decay_style .................................. cosine - lr_decay_tokens ................................. 260000000000 - lr_warmup_fraction .............................. None - lr_warmup_iters ................................. 0 - lr_warmup_samples ............................... 216320 - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - make_vocab_size_divisible_by .................... 128 - mask_prob ....................................... 0.15 - masked_softmax_fusion ........................... False - max_position_embeddings ......................... 2048 - memory_centric_tiled_linear ..................... False - merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt - micro_batch_size ................................ 1 - min_loss_scale .................................. 1.0 - min_lr .......................................... 6e-06 - mmap_warmup ..................................... False - no_load_optim ................................... None - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - no_load_rng ..................................... None - no_save_optim ................................... None - no_save_rng ..................................... 
None - num_attention_heads ............................. 80 - num_channels .................................... 3 - num_classes ..................................... 1000 - num_layers ...................................... 64 - num_layers_per_virtual_pipeline_stage ........... None - num_workers ..................................... 2 - _ = _build_tokenizer(args) - onnx_safe ....................................... None - openai_gelu ..................................... False - optimizer ....................................... adam - override_lr_scheduler ........................... False - params_dtype .................................... torch.float16 - partition_activations ........................... False - patch_dim ....................................... 16 - pipeline_model_parallel_size .................... 32 - position_embedding_type ......................... PositionEmbeddingType.absolute - profile_backward ................................ False - query_in_block_prob ............................. 0.1 - rampup_batch_size ............................... None - rank ............................................ 0 - remote_device ................................... none - reset_attention_mask ............................ False - reset_position_ids .............................. False - _ = _build_tokenizer(args) File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - - retriever_report_topk_accuracies ................ [] - retriever_score_scaling ......................... False - retriever_seq_length ............................ 256 - sample_rate ..................................... 1.0 - save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - save_interval ................................... 300 - scatter_gather_tensors_in_pipeline .............. True - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - self.encoder = json.load(open(vocab_file)) - scattered_embeddings ............................ False - seed ............................................ 43 - seq_length ...................................... 2048 - sgd_momentum .................................... 0.9 - short_seq_prob .................................. 0.1 - split ........................................... 949,50,1 - split_transformers .............................. False - synchronize_each_layer .......................... False - tensor_model_parallel_size ...................... 4 - tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - tensorboard_log_interval ........................ 1 - tensorboard_queue_size .......................... 5 - tile_factor ..................................... 1 - titles_data_path ................................ None - tokenizer_name_or_path .......................... None - tokenizer_type .................................. GPT2BPETokenizer -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - train_iters ..................................... 
None - train_samples ................................... 600000000 - train_tokens .................................... 300000000000 - use_checkpoint_lr_scheduler ..................... False - use_contiguous_buffers_in_ddp ................... False - use_cpu_initialization .......................... None - use_one_sent_docs ............................... False - use_pin_memory .................................. False - _GLOBAL_TOKENIZER = build_tokenizer(args)_GLOBAL_TOKENIZER = build_tokenizer(args) - virtual_pipeline_model_parallel_size ............ None - vocab_extra_ids ................................. 0 - vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json - weight_decay .................................... 0.1 - world_size ...................................... 128 - zero_allgather_bucket_size ...................... 0.0 - zero_contigious_gradients ....................... False - zero_reduce_bucket_size ......................... 0.0 - zero_reduce_scatter ............................. False - zero_stage ...................................... 1 --------------------- end of arguments --------------------- - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer -setting number of micro-batches to constant 2048 -> building GPT2BPETokenizer tokenizer ... - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
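The "setting number of micro-batches to constant 2048" line follows directly from the arguments above. A minimal sketch of the arithmetic, assuming Megatron's usual decomposition (variable names here are illustrative, not Megatron's own):

    # Micro-batches per global batch, from the arguments dump above.
    global_batch_size = 2048
    micro_batch_size = 1
    data_parallel_size = 1
    num_micro_batches = global_batch_size // (micro_batch_size * data_parallel_size)
    assert num_micro_batches == 2048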
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain
-    initialize_megatron(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron
-    set_global_variables(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables
-    _ = _build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer
-    _GLOBAL_TOKENIZER = build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer
-    tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__
-    self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__
-    self.encoder = json.load(open(vocab_file))
-FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json'
set_global_variables - _ = _build_tokenizer(args)initialize_megatron(extra_args_provider=extra_args_provider, - - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - _GLOBAL_TOKENIZER = build_tokenizer(args) - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, -set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - set_global_variables(extra_args_provider=extra_args_provider, - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - _ = _build_tokenizer(args) - self.encoder = json.load(open(vocab_file)) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - _GLOBAL_TOKENIZER = build_tokenizer(args) - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - 
self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in -/bin/sh: line 0: type: git: not found - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain -Traceback (most recent call last): - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - self.encoder = json.load(open(vocab_file)) -DeepSpeed general environment info: - _GLOBAL_TOKENIZER = build_tokenizer(args) 
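Every rank dies at tokenizer construction because the GPT-2 BPE vocab file is missing from the data/ directory that --vocab-file points at. A minimal repair sketch in shell, assuming the run expects the standard GPT-2 BPE assets from the Megatron-LM README (the URLs are the published ones; the target directory is taken from the traceback above):

  # restore the GPT-2 BPE files next to where the tokenizer looks for them
  cd /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data
  wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
  wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt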
torch 1.8, cuda 11.1 - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 
11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain
-    initialize_megatron(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron
-    set_global_variables(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables
-    _ = _build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer
-    _GLOBAL_TOKENIZER = build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer
-    tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__
-    self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__
-    self.encoder = json.load(open(vocab_file))
-FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json'
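The frame at gpt2_tokenization.py, line 164 is the root cause every rank hits: the tokenizer constructor calls json.load(open(vocab_file)) with no existence check, so a --vocab-file path that is absent on the compute nodes surfaces as this FileNotFoundError on every process at once. A minimal pre-flight sketch that could be run before launching; check_tokenizer_files is a hypothetical helper, not part of Megatron-DeepSpeed, and the gpt2-merges.txt path is assumed for illustration (only the vocab path appears in the log):

# Hypothetical pre-flight check (not part of Megatron-DeepSpeed): fail fast
# if the files the job will pass as --vocab-file/--merge-file are missing,
# instead of crashing every rank inside gpt2_tokenization.py.
import json
import os
import sys

def check_tokenizer_files(vocab_file, merge_file):
    for path in (vocab_file, merge_file):
        if not os.path.isfile(path):
            sys.exit(f"missing tokenizer file: {path}")
    with open(vocab_file) as f:
        json.load(f)  # same call that raised above; also validates the JSON

if __name__ == "__main__":
    check_tokenizer_files(
        "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json",
        # merges path assumed to sit next to the vocab file
        "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt",
    )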
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
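The bracketed table above is DeepSpeed's op-compatibility report, the same output its ds_report utility prints; the two columns give installed / compatible status per op, which is why async_io stays [NO]/[NO] until the libaio headers are available. A minimal sketch for regenerating it in the same environment, assuming only that the deepspeed package (and its ds_report console script) is installed:

# Re-run DeepSpeed's compatibility report outside the job; ds_report is the
# CLI installed alongside the deepspeed package.
import subprocess

subprocess.run(["ds_report"], check=True)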
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in <module>
-    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain
-    initialize_megatron(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron
-    set_global_variables(extra_args_provider=extra_args_provider,
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables
-    _ = _build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer
-    _GLOBAL_TOKENIZER = build_tokenizer(args)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer
-    tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__
-    self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
-  File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__
-    self.encoder = json.load(open(vocab_file))
-FileNotFoundError: [Errno 2] No such file or directory: 
'/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - 
set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/training.py", line 97, in pretrain - initialize_megatron(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/initialize.py", line 53, in initialize_megatron - set_global_variables(extra_args_provider=extra_args_provider, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 93, in set_global_variables - _ = _build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/global_vars.py", line 125, in _build_tokenizer - _GLOBAL_TOKENIZER = build_tokenizer(args) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 43, in build_tokenizer - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: 
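Every rank dies at the same frame: gpt2_tokenization.py runs `json.load(open(vocab_file))` and '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' is simply absent, so the repeated tracebacks in this log are one failure fanned out across every process. A minimal preflight sketch that a launch script could run before spawning workers, assuming only the paths visible in this log (the helper name check_tokenizer_files is hypothetical, not part of Megatron-DeepSpeed):

import os
import sys

# Paths copied verbatim from the --vocab-file / --merge-file arguments in this log.
VOCAB_FILE = "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json"
MERGE_FILE = "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt"

def check_tokenizer_files(paths):
    """Return every path that does not exist as a regular file."""
    return [p for p in paths if not os.path.isfile(p)]

if __name__ == "__main__":
    missing = check_tokenizer_files([VOCAB_FILE, MERGE_FILE])
    for p in missing:
        print(f"missing tokenizer file: {p}", file=sys.stderr)
    # Fail fast in the job script instead of letting every rank crash in __init__.
    sys.exit(1 if missing else 0)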
-Killing subprocess 192984
-Killing subprocess 192985
-Killing subprocess 192986
-Killing subprocess 192988
-Traceback (most recent call last):
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code
-    exec(code, run_globals)
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in <module>
-    main()
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main
-    sigkill_handler(signal.SIGTERM, None)  # not coming back
-  File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
-    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
-subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1.
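The launcher trace itself is only fallout: torch.distributed.launch's sigkill_handler re-raises the first worker's exit code as a CalledProcessError, so the one real fault is the missing gpt2-vocab.json / gpt2-merges.txt pair under data/. A sketch of one way to restore them, under the assumption that the GPT-2 BPE mirrors listed in the Megatron-LM README are still live (the URL choice is our guess; this log does not record where the files originally came from):

import os
import urllib.request

# Destination directory taken from the failing --vocab-file path in this log.
DATA_DIR = "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data"

# Assumed mirrors (Megatron-LM README); any copy of the GPT-2 BPE files works.
FILES = {
    "gpt2-vocab.json": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
    "gpt2-merges.txt": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}

os.makedirs(DATA_DIR, exist_ok=True)
for name, url in FILES.items():
    dest = os.path.join(DATA_DIR, name)
    if not os.path.exists(dest):  # re-download only what is missing
        urllib.request.urlretrieve(url, dest)
        print(f"restored {dest}")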
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/tokenizer.py", line 274, in __init__ - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/tokenizer/gpt2_tokenization.py", line 164, in __init__ - self.encoder = json.load(open(vocab_file)) -FileNotFoundError: [Errno 2] No such file or directory: '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json' -Killing subprocess 2363481 -Killing subprocess 2363482 -Killing subprocess 2363483 -Killing subprocess 2363484 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', 
'--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. -Killing subprocess 684099 -Killing subprocess 684100 -Killing subprocess 684101 -Killing subprocess 684102 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in -Killing subprocess 183506 -Killing subprocess 183507 -Killing subprocess 183508 -Killing subprocess 183509 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', 
'--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. -Killing subprocess 183548 -Killing subprocess 183549 -Killing subprocess 183550 -Killing subprocess 183551 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main -Killing subprocess 185348 - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code -Killing subprocess 185349 -Killing subprocess 185350 -Killing subprocess 185351 -Traceback (most recent call last): - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', 
'--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. -Killing subprocess 179523 -Killing subprocess 179524 -Killing subprocess 179525 -Killing subprocess 179526 -Traceback (most recent call last): - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main -Killing subprocess 182881 -Killing subprocess 182882 -Killing subprocess 182883 -Killing subprocess 182884 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
- return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code -Killing subprocess 183047 -Killing subprocess 183048 -Killing subprocess 183049 -Killing subprocess 183050 - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - return _run_code(code, main_globals, None, - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
- main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
-subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr main() -', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
- File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
-Killing subprocess 178625 -Killing subprocess 178626 -Killing subprocess 178627 -Killing subprocess 178628 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler -Killing subprocess 332558 -Killing subprocess 332559 -Killing subprocess 332560 -Killing subprocess 332561 -Traceback (most recent call last): - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -Killing subprocess 207830 - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main -Killing subprocess 207831 -Killing subprocess 207832 -Killing subprocess 207833 -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
-Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
- main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -Killing subprocess 368968 -Killing subprocess 368969 -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--clip-grad', '1.0', '--fp16', '--checkpoint-activations', '--no-masked-softmax-fusion', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lrKilling subprocess 368970 -Killing subprocess 368971 -', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '1190', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1513102.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. 
-Killing subprocess 195740
-Killing subprocess 195741
-Killing subprocess 195742
-Killing subprocess 195743
-Killing subprocess 229259
-Killing subprocess 229260
-Killing subprocess 229261
-Killing subprocess 229262
-Killing subprocess 229024
-Killing subprocess 229025
-Killing subprocess 229026
-Killing subprocess 229027
-Killing subprocess 295501
-Killing subprocess 295502
-Killing subprocess 295503
-Killing subprocess 295504
-Killing subprocess 473231
-Killing subprocess 473232
-Killing subprocess 473233
-Killing subprocess 473234
-Killing subprocess 185789
-Killing subprocess 185790
-Killing subprocess 185791
-Killing subprocess 185792
-Killing subprocess 306062
-Killing subprocess 306063
-Killing subprocess 306064
-Killing subprocess 306065
-Killing subprocess 807367
-Killing subprocess 807368
-Killing subprocess 807369
-Killing subprocess 807370
-Killing subprocess 804449
-Killing subprocess 804450
-Killing subprocess 804451
-Killing subprocess 804452
-Killing subprocess 2641588
-Killing subprocess 2641589
-Killing subprocess 2641590
-Killing subprocess 2641591
-Killing subprocess 189620
-Killing subprocess 189621
-Killing subprocess 189622
-Killing subprocess 189623
-Killing subprocess 189056
-Killing subprocess 189057
-Killing subprocess 189058
-Killing subprocess 189060
-Killing subprocess 2205655
-Killing subprocess 2205656
-Killing subprocess 2205657
-Killing subprocess 2205659
-Killing subprocess 208133
-Killing subprocess 208134
-Killing subprocess 208135
-Killing subprocess 208137
-Killing subprocess 183145
-Killing subprocess 183146
-Killing subprocess 183147
-Killing subprocess 183149
-Killing subprocess 1539973
-Killing subprocess 1539974
-Killing subprocess 1539975
-Killing subprocess 1539977
-Killing subprocess 2694736
-Killing subprocess 2694737
-Killing subprocess 2694738
-Killing subprocess 2694740
-Killing subprocess 227976
-Killing subprocess 227977
-Killing subprocess 227978
-Killing subprocess 227980
-Killing subprocess 1819600
-Killing subprocess 1819601
-Killing subprocess 1819602
-Killing subprocess 1819604
-srun: error: r8i3n4: task 16: Exited with exit code 1
-srun: Terminating job step 1513102.0
-srun: error: r8i4n3: task 24: Exited with exit code 1
-srun: error: r6i3n0: task 0: Exited with exit code 1
-srun: error: r7i7n1: task 9: Exited with exit code 1
-srun: error: r8i4n6: task 27: Exited with exit code 1
-srun: error: r8i3n2: task 14: Exited with exit code 1
-srun: error: r8i4n2: task 23: Exited with exit code 1
-srun: error: r8i3n5: task 17: Exited with exit code 1
-srun: error: r8i4n5: task 26: Exited with exit code 1
-srun: error: r7i6n4: task 3: Exited with exit code 1
-srun: error: r8i3n1: task 13: Exited with exit code 1
-srun: error: r8i3n8: task 20: Exited with exit code 1
-srun: error: r8i5n0: task 30: Exited with exit code 1
-srun: error: r7i6n8: task 7: Exited with exit code 1
-srun: error: r7i6n7: task 6: Exited with exit code 1
-srun: error: r7i6n5: task 4: Exited with exit code 1
-srun: error: r6i3n1: task 1: Exited with exit code 1
-srun: error: r6i3n2: task 2: Exited with exit code 1
-srun: error: r8i3n7: task 19: Exited with exit code 1
-srun: error: r7i6n6: task 5: Exited with exit code 1
-srun: error: r8i4n1: task 22: Exited with exit code 1
-srun: error: r8i4n0: task 21: Exited with exit code 1
-srun: error: r8i2n8: task 11: Exited with exit code 1
-srun: error: r8i5n1: task 31: Exited with exit code 1
-srun: error: r8i4n7: task 28: Exited with exit code 1
-srun: error: r8i3n0: task 12: Exited with exit code 1
-srun: error: r8i3n3: task 15: Exited with exit code 1
-srun: error: r8i4n4: task 25: Exited with exit code 1
-srun: error: r7i7n0: task 8: Exited with exit code 1
-srun: error: r8i3n6: task 18: Terminated
-srun: error: r8i4n8: task 29: Terminated
-srun: error: r8i2n7: task 10: Exited with exit code 1
-srun: Force Terminated job step 1513102.0
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
..................[OKAY] -[OKAY][OKAY][OKAY]-------------------------------------------------- - -sparse_attn transformer[NO]........................ ...................[NO][NO] [OKAY] - [NO].............. transformer.......[OKAY] -................ ................................ op name installedinstalled installed ...................... compatibleinstalledcompatible - -compatible--------------------------------------------------.. --------------------------------------------------- --------------------------------------------------- - -compatible --------------------------------------------------- -stochastic_transformerstochastic_transformerstochastic_transformer .. . [NO] [NO] [NO] ....... ....... ....... [OKAY] [OKAY] -[OKAY] -fused_lamb ............. fused_lamb[NO] .............fused_lambfused_adam....... [NO]..........................[OKAY] - .......[NO][NO] [OKAY].............. -op name op nameop nameop name................ ................................installed................ installedinstalled..installed compatible -...... -------------------------------------------------- -compatiblecompatiblecompatible - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------compatiblecompatiblecompatible - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -ninjaninjaninja ninja .................. ..................[OKAY] .................. -compatiblecompatible..-------------------------------------------------- - - ---------------------------------------------------compatible-------------------------------------------------- - - -cpu_adam ...............cpu_adamcpu_adam cpu_adam [YES]............... ............... ............... ......[YES][YES] ......[OKAY]...... [YES] -[OKAY][OKAY] - -...... [OKAY] - .................... [NO] [OKAY] [NO] -fused_adam ............. [NO] ....... fused_adamfused_adam[OKAY]fused_adam -compatiblecompatible--------------------------------------------------installed - - - ----------------------------------------------------------------------------------------------------.. - -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY][OKAY][OKAY] - - - --------------------------------------------------- -----------------------------------------------------------------------------------------------------op name -op name - op name................op name ................ installed................ ................ installed installed .. installed .... compatible ..compatible - [NO]fused_adam fused_adam....................fused_lamb [NO][OKAY].......................... - - ------------------------------------------------------------------------------------------------------------------------------------------------------- -op name - - ............ [OKAY] [OKAY]transformer - [NO] -cpu_adamcpu_adam ...............cpu_adam...............cpu_adam [YES].............................. [YES] ......[YES] ......[YES]......[OKAY] ......[OKAY] - -[OKAY] - - [OKAY][OKAY] - - - -cpu_adam ...............cpu_adamcpu_adamcpu_adam [YES] ................................................... [YES][OKAY][YES] - .................. 
--------------------------------------------------[OKAY] [OKAY] - -[OKAY] --------------------------------------------------- -fused_adam .............fused_adamfused_adam fused_adam.............[NO] [NO]....... .......................... [OKAY] -....... .......[OKAY] -[OKAY] - ............. ..........................[NO] fused_lamb [NO] [NO]....... ............. .............. [OKAY] [OKAY][NO] - -[OKAY] - compatible --------------------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - -op name -cpu_adamcpu_adam cpu_adam cpu_adam ............................................. ............... [YES][YES] [YES] [YES].................. ......[OKAY][OKAY][OKAY] - -[OKAY] -compatible -compatible-------------------------------------------------- - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - - .......[NO][NO] [OKAY].......fused_lamb....... - [OKAY][OKAY]............. - - op nameop name................op name ................ installed................................ installed..installed installed .. compatible ....compatible - - compatible---------------------------------------------------------------------------------------------------- -compatible - -............ transformer ....... stochastic_transformer[NO]............[OKAY] - .......[NO] . [OKAY].......[NO]stochastic_transformer -[OKAY] -ninjaninjaninja ninja...................................................... ..................[OKAY][OKAY] [OKAY] - -[OKAY] -sparse_attn fused_lamb............ .............[NO] sparse_attn [NO] ....... sparse_attn................... ............[OKAY][OKAY] [NO] - -cpu_adam ............... [YES] cpu_adamcpu_adam...... cpu_adam ...............[OKAY] ............... ............... -[YES] ...... ...... ...... [OKAY] [OKAY] -[OKAY] - -op name -----------------------------------------------------------------------------------------------------................ - - --------------------------------------------------op nameinstalledop name -cpu_adam ...............cpu_adam cpu_adam [YES]cpu_adam ............... ..................... ............... [YES][YES][OKAY][YES] -.......[NO][NO] fused_lamb[OKAY].............. -............. [OKAY] [OKAY][NO] -fused_lamb -sparse_attn ............ [NO]sparse_attn ................... [OKAY][NO] -....... fused_lamb[OKAY]fused_lamb fused_lamb -cpu_adam cpu_adam............... cpu_adam...............[YES] ...............cpu_adam......[YES] [YES][OKAY]...... - op nameop name................ op name ................................ installed installed ................installed .... .. installedcompatible compatible -compatible -..-------------------------------------------------- - --------------------------------------------------- ---------------------------------------------------compatible - --------------------------------------------------- - -cpu_adam ...............cpu_adam cpu_adamcpu_adam[YES] ................................................... [YES] [YES][OKAY] [YES] -fused_lamb [NO] .............fused_lamb....... [NO].............[OKAY] - --------------------------------------------------- --------------------------------------------------- - [OKAY]....... -fused_adam fused_adam.............fused_adamfused_adam ............. [NO]............. 
.............[NO] .......[NO] [OKAY][NO].............. ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- -op name -[NO] .............. [OKAY][OKAY] - -[YES] [YES] [YES] ...... ...... ...... [OKAY][OKAY] - -[OKAY] -fused_adam ............. fused_adamfused_adamfused_adam[NO] .......................... ............. [NO] [NO] [NO]....... ....... .......[OKAY][OKAY] ....... - ................op name .. ................ ................installed compatible - installed..--------------------------------------------------installed -............ ......[OKAY][OKAY] - -[OKAY] - .................... fused_lamb[OKAY]fused_lamb[NO] -sparse_attn sparse_attn transformer....... ............ ............ [OKAY]............ - ............. ..........................[NO] [NO] [NO]....... ....... ....... [OKAY] [OKAY]sparse_attn - -.....................[OKAY] - [YES][OKAY] -cpu_adam ............... cpu_adamcpu_adam[YES] cpu_adam .............................. ...... ...............[YES] [YES][OKAY] -fused_adamfused_adam fused_adam fused_adam............. ............. [NO][NO]............. ............. ....... ....... [NO][OKAY] [NO] -[OKAY] ....... - .................. [OKAY][OKAY][OKAY] - - -.......[NO]sparse_attn [OKAY] ................... -cpu_adamcpu_adam ...............cpu_adam............... cpu_adam [YES][YES] ............... ............... ...... [YES] [OKAY] ......[YES] -...... [OKAY]......[OKAY] -stochastic_transformer . [OKAY]stochastic_transformer[NO] - [OKAY].......[OKAY] -fused_lamb - .............[OKAY] -op name op nameop name ................ ................ ................................installed installedinstalled..installed ..compatible.. compatible.. -transformer ............ transformer[NO]transformer ................... sparse_attn............[NO][OKAY] ............[NO]....... -fused_adam ............. [NO] ....... [OKAY]fused_adamfused_adam - - [OKAY][OKAY] - - compatible.... -fused_adam ............. [NO] ....... [OKAY]fused_adamfused_adam - ................................. [NO][OKAY][NO] - .............. [OKAY][OKAY] - - [NO][NO][NO] transformer ....... .......................... [NO][OKAY][OKAY][OKAY] - -[OKAY] -............ [NO] ....... [OKAY] -...... [OKAY] -[YES] .................. [OKAY][OKAY] -[OKAY] - - .......fused_lamb[OKAY] - .............[OKAY]fused_lamb -fused_adam ............. [NO] ....... [OKAY]fused_adam - [OKAY][NO] - sparse_attn....... ............[OKAY] - -[OKAY] -. ........[NO] [OKAY] -[NO]....... .......[OKAY] -[OKAY] -fused_lamb[NO]fused_lamb fused_lamb....... ....................................... [NO][OKAY][NO] [NO] -compatible - -------------------------------------------------- --------------------------------------------------- -compatible --------------------------------------------------- - --------------------------------------------------- - .......[NO][OKAY] stochastic_transformer -.......[OKAY] -[OKAY].stochastic_transformer - fused_adam............. ............. ............. fused_lamb[NO][NO] ........................... [NO] [OKAY][NO][OKAY]....... - -fused_lamb .............fused_lamb fused_lamb fused_lamb [NO] ............. ............. .................... [NO] [NO] [NO][OKAY] ....... - --------------------------------------------------compatiblecpu_adamcompatible - - - fused_adam.............fused_lamb............. ..........................[NO][NO] .......[NO] ....... 
[NO] [OKAY] [OKAY] -....... -sparse_attn ............ [NO] ....... [OKAY]sparse_attn - -.......transformertransformer [OKAY] ............ -transformer ............ sparse_attn[NO]sparse_attnsparse_attn ............ ............................... [NO] [OKAY] [NO][NO] -fused_adam ............. [NO]fused_adam .................... [OKAY][NO] -fused_adam fused_adam.............fused_adam fused_adam [NO] .......................... ............. [NO][NO][NO] ..................... ....... [OKAY][OKAY] - [NO]............. fused_lamb .......fused_lamb [NO]............. [OKAY] -....................[NO] [NO][OKAY] -fused_adamfused_adam fused_lamb....................................... .............[NO][NO][NO] [NO].............. ....... [OKAY].......[OKAY][OKAY] - - -[NO] ....... transformer[OKAY] -fused_adam ............. [NO]fused_adam ....................fused_adamfused_adam [OKAY][NO].......................... - ....... ....... ....... [OKAY][OKAY] -[OKAY] - -cpu_adamcpu_adam cpu_adam .............................. [YES]cpu_adam...............[YES] ...... [YES][OKAY]..................... - [NO]stochastic_transformer . transformer....... ............. [NO] [OKAY] [NO] - ....... [OKAY]fused_lamb[OKAY] -fused_lamb -....... .......[OKAY] -[OKAY][OKAY] - -...............-------------------------------------------------- -------------------------------------------------- -[YES] - cpu_adam...... ...............[OKAY] -....... [OKAY]fused_lambfused_lamb -[OKAY] - ............ transformer[NO] sparse_attnsparse_attn ............ .......[NO]........................ .......[OKAY] [NO] -............ [NO]stochastic_transformer[NO] stochastic_transformer............... [NO] [OKAY]. -.......[OKAY] -[OKAY]stochastic_transformer[NO] -....... [OKAY]..............stochastic_transformer - [OKAY][OKAY] - - fused_adam....... fused_adam.............fused_lamb[OKAY] -[OKAY][OKAY] - - -.............. [OKAY][OKAY] - -[OKAY] -............sparse_attn transformer [NO]sparse_attn ............................... ............[NO][OKAY] -.......[NO][NO] [OKAY]fused_lamb -sparse_attn ............ [NO] ....... [OKAY] - ......[OKAY][YES] - ......[OKAY] -[OKAY] -[NO] ....... ....... ....... [OKAY] [OKAY] - -[OKAY] - .......................... [NO][NO]fused_lamb ........................... [OKAY][OKAY][NO] - - ....... [OKAY] -sparse_attn ............ [NO] sparse_attn....... sparse_attn ............ [OKAY] sparse_attn............ - [YES] ......cpu_adam [OKAY]cpu_adam............... -.......................... [NO][NO] fused_lamb.............. .............[OKAY][OKAY] sparse_attn - -[NO] [OKAY]transformer....... - .......[OKAY]............ -stochastic_transformer ....... .[OKAY]. -.transformer transformer............[NO]transformer ............[NO]................... [NO] .......[OKAY] [NO] ....... -[OKAY] -.......[OKAY] -.............[NO]............. fused_lamb ....... [NO] [NO][OKAY].................... -fused_lamb .............fused_lambfused_lamb fused_lamb .............[NO] ..........................[NO] [NO][NO]....... ..............[OKAY] .......[OKAY] - -sparse_attn ............ [NO] ....... [OKAY]sparse_attn -fused_lambfused_lamb fused_lamb............. .............[NO]............. [NO].......[NO] .......[OKAY].......sparse_attn -[OKAY] ............ -[NO] [NO].............. stochastic_transformer .......[OKAY][OKAY] - -[OKAY]. -.............. .............fused_lamb[OKAY][OKAY] -[NO] -............. .......[NO] fused_lamb [OKAY] .......fused_lamb -sparse_attnsparse_attn transformersparse_attn............ ............ 
............ [NO]............ [NO] [NO].......[NO] ....... ....... [OKAY] ....... -[OKAY] [OKAY] - -[OKAY] -fused_adam ............. [NO] ....... [OKAY]fused_adamfused_adam -stochastic_transformer . [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY]sparse_attn -[NO] ............[NO]transformer....... [OKAY][NO]............ - ...............[YES] fused_adam[YES] ......................... fused_adam [OKAY][NO] [OKAY] - [NO]............ [NO]....... ....... [OKAY][OKAY] - - stochastic_transformer[OKAY][NO] transformer - . ....... ............transformer [NO] [OKAY]...................[NO] - [NO][NO] .............. [OKAY][OKAY] - -[OKAY] - .......[NO][OKAY]fused_lamb ....... - [OKAY]............. - [OKAY][NO] -[OKAY][OKAY] - - sparse_attn............sparse_attn transformer ........................ [NO] ............[NO][NO] ....... ....... [NO]....... [OKAY] [OKAY] -.......[OKAY] -[OKAY] -[NO] ....... [OKAY] - transformer[NO] stochastic_transformertransformer ............ ....... ............. [NO][OKAY] [NO] - ............. [OKAY] ............. -transformertransformertransformer stochastic_transformer ............ ............ ............[NO] [NO]. [NO] ..............[NO] .......[OKAY]....... - [OKAY] [OKAY] -[OKAY] - fused_adam.......................... fused_lamb............. [NO] [NO] .............[NO] [NO]....... .............. ....... [OKAY][OKAY] -[OKAY] -sparse_attn ............transformer sparse_attn............ [NO] ............ ............[NO][NO]....... .......[NO].......[OKAY] -.......[OKAY][OKAY] - -[OKAY]transformer -....... ....... transformer[NO] [OKAY]............[OKAY] .......[NO] - - [OKAY]....... - [OKAY]transformertransformer - -.................... [NO][OKAY] -transformersparse_attnsparse_attn .................................... [NO][NO][NO] sparse_attn....... ....... ....... ............[OKAY][OKAY] - [OKAY] - [OKAY][NO]....... - stochastic_transformer ....... [OKAY] -stochastic_transformer stochastic_transformer. stochastic_transformer .[NO] . [NO] ....... [NO] ....... [OKAY] ....... -[OKAY] -[OKAY] - ....... [OKAY]fused_lamb -sparse_attnsparse_attn sparse_attn............ ............[NO] [NO]....... .......[OKAY]sparse_attn ............ -[OKAY]............[NO] - -transformer[OKAY] transformer -transformer ............sparse_attn [NO]sparse_attn ................... sparse_attn [NO] ............[OKAY] ............ - .......[NO][NO] stochastic_transformer[OKAY].............. -[NO] ....... ....... ....... [OKAY][OKAY] - -[OKAY] -[NO] [NO]....... .......[OKAY] -[OKAY] -stochastic_transformer -[OKAY] - - transformer............stochastic_transformer transformer ............[NO] ............. [NO] .......[NO] [NO] ....... .......[OKAY] ....... [OKAY] -[OKAY] - -[OKAY] - ............stochastic_transformer............ [NO]stochastic_transformer[NO] ............... .[NO] [OKAY][NO][OKAY] - -....... .......[OKAY] -[OKAY]stochastic_transformer -....... [OKAY]fused_lamb -[NO] -[OKAY]. - ............. sparse_attn[NO] ............ .......sparse_attn[NO] [OKAY]................... -sparse_attn [NO] [OKAY] ............ - transformer [NO]transformer ....... ....... ............ ............[OKAY] [OKAY] - [NO] -............transformer ............[NO]............stochastic_transformer [NO][NO]........ ....... [OKAY]....... [NO] - [OKAY] [OKAY] -....... - [OKAY].[OKAY] - transformer -stochastic_transformer stochastic_transformer. [NO]. .......[NO] [OKAY]....... -sparse_attn sparse_attn............ ............[NO] [NO]....... 
.......sparse_attnsparse_attn[OKAY] -[OKAY]........................ - stochastic_transformer.stochastic_transformer [NO] ......... [NO][NO][OKAY] -fused_lambfused_lambfused_lamb ....................................... [NO][NO][NO] .......sparse_attn.............. [OKAY][OKAY][OKAY]............ -stochastic_transformerstochastic_transformer stochastic_transformer .. .[NO][NO] [NO].............. .......[OKAY][OKAY] - -[OKAY] -stochastic_transformer .. [NO][NO] ....... .......[OKAY] -[OKAY] - ............. [NO]fused_adam fused_adamfused_lamb....... ..........................[OKAY] ............. - stochastic_transformer.......transformer transformer ............[OKAY] ............. -[NO][NO][NO] ..............transformer ....... [OKAY][OKAY]............ - - stochastic_transformer[NO] stochastic_transformer........ [OKAY]. -....... [NO] [OKAY]transformer....... - ............[OKAY] -[NO] .......transformer....... transformer [OKAY][OKAY] - ............ - [OKAY]stochastic_transformer -[NO]transformer transformer................... ............ ............[NO] [OKAY] .......[NO] - [OKAY] - transformer[NO]transformer[NO] ............ ............[NO].............. [NO] .......[OKAY][OKAY] - .............. [OKAY][OKAY] - - - -[NO] ....... [OKAY] -[NO] [NO][NO]....... ..............[OKAY] -[OKAY][OKAY] - [OKAY][NO] -[NO] [NO]....... .......[OKAY] -[OKAY] -[NO]transformer ...................transformer [OKAY]sparse_attn[NO] -............ [NO]stochastic_transformer[NO] stochastic_transformer.............. .[OKAY] - stochastic_transformerstochastic_transformer . .[NO]. [NO].......[NO] ..............[OKAY] -[OKAY][OKAY] - - [NO][OKAY]....... -....... [OKAY][OKAY] - - -.......[OKAY]transformer - [OKAY]transformer............ -transformer ............ [NO] sparse_attn.......sparse_attnsparse_attn [OKAY].................................... -fused_lamb -stochastic_transformer ....... .stochastic_transformer [OKAY] [NO] - ........ [OKAY][NO] -............ .......[NO]............ [OKAY].......stochastic_transformer[NO] - [OKAY]. - .[OKAY][NO] -stochastic_transformer [NO] ....... .......stochastic_transformer[OKAY] . . -stochastic_transformer .stochastic_transformer stochastic_transformer[NO] ........ . [NO][NO] [OKAY] ....... - stochastic_transformer............[NO] stochastic_transformer[NO]. ....... ........ [NO] [NO][OKAY] [OKAY] -....... - [NO][NO][NO]stochastic_transformer ....... ..............[OKAY]. - sparse_attn............. ............fused_lamb[NO] .............[NO].......sparse_attn [NO] .......[OKAY] ............ - [OKAY][NO] -stochastic_transformer ....... [OKAY]. - [NO] ....... [OKAY] -....... stochastic_transformer[NO] stochastic_transformer [OKAY]........ . [NO] - [OKAY] [NO]....... - [OKAY][NO] -....... [OKAY][OKAY] - -....... [OKAY][OKAY]stochastic_transformer - - [OKAY][OKAY][NO] - -....... transformer....... [OKAY][OKAY]............ - - ....... [OKAY]transformer[OKAY] - - ............ [NO] ....... [OKAY] - [NO]....... .......[OKAY] -[OKAY] -stochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - - transformer....... transformer ............transformer [OKAY] [NO]........................ - [NO].......[NO] [OKAY].............. - [NO] .......transformersparse_attn ............[OKAY]............ -stochastic_transformer . [NO] ....... [OKAY] - [OKAY][OKAY] - - [NO] [NO]....... 
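The cpu_adam row above ([YES] installed, [OKAY] compatible) is the kernel behind DeepSpeed's CPU Adam optimizer. A minimal sketch of exercising it, assuming this install exposes the deepspeed.ops.adam.DeepSpeedCPUAdam import path:

    import torch
    from deepspeed.ops.adam import DeepSpeedCPUAdam

    # Constructing the optimizer is what actually loads the cpu_adam op;
    # the parameter list here is a stand-in for real model parameters.
    params = [torch.nn.Parameter(torch.zeros(8))]
    optimizer = DeepSpeedCPUAdam(params, lr=1e-3)

Ops marked [NO] in the installed column would instead be built on first use, as the report below explains.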
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
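The NOTE above describes the JIT path taken for ops marked [NO] in the installed column. A minimal sketch of triggering that path by hand, assuming DeepSpeed's deepspeed.ops.op_builder interface:

    from deepspeed.ops.op_builder import FusedAdamBuilder

    builder = FusedAdamBuilder()
    # is_compatible() mirrors the "compatible" column of the report:
    # it checks for the headers/libraries needed to JIT-build the op.
    if builder.is_compatible():
        # load() runs the ninja-driven JIT compile on first call and
        # returns the built extension module (hence "requires ninja").
        fused_adam = builder.load()

Later calls to load() reuse the cached build, so only the first use pays the compile cost.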
Op compatibility means that your system - meet the required dependencies to JIT install the op. ---------------------------------------------------JIT compiled ops requires ninja - - -. [NO] ....... [OKAY] -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninja ....................................ninja .................. [OKAY] [OKAY] - .................. - --------------------------------------------------[OKAY][OKAY]-------------------------------------------------- - - - -op name-------------------------------------------------- --------------------------------------------------op name................ - - ................installedop nameop name .................. installed compatible................ installed - .. --------------------------------------------------..installed - compatiblecompatible - -..---------------------------------------------------------------------------------------------------- - -cpu_adamcompatible -............... [YES]-------------------------------------------------- -...... cpu_adam[OKAY] cpu_adam -............... ............... [YES][YES] ............cpu_adam fused_adam[OKAY][OKAY]............... - - [YES]............. [NO]...... ....... [OKAY][OKAY] - -fused_adamfused_adamfused_lamb ............. ..........................[NO] [NO][NO]....... .......[OKAY].......fused_adam - [OKAY].............[OKAY]fused_lamb - - .............[NO]fused_lamb [NO] .................... ....... [NO] [OKAY] [OKAY]sparse_attn....... - - ............[OKAY] -fused_lamb[NO] .................... [OKAY] -[NO] transformer....... ............[OKAY] sparse_attn[NO] - sparse_attn................... ............[NO][OKAY] - [NO]....... stochastic_transformer [OKAY] ....... - .[OKAY] -transformer[NO]sparse_attn ...................transformer ............ [NO] [OKAY]............[NO]....... - [OKAY][NO]....... - .......[OKAY] [OKAY] -stochastic_transformer - transformer. stochastic_transformer[NO] ............ ........ [NO][OKAY] [NO] -....... ....... [OKAY][OKAY] - -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
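async_io is the only op reported unavailable: its probe wants the libaio development headers and shared object, and the suggested fix is distro-specific (libaio-devel via yum on this system; libaio-dev on apt-based ones). A hedged way to re-run just that probe after installing, rather than relaunching the whole job (AsyncIOBuilder and is_compatible() are assumed from the same deepspeed 0.5.x tree):

from deepspeed.ops.op_builder import AsyncIOBuilder

# Same libaio header/.so probe that produced the [WARNING] lines above.
# If libaio was built from source into a non-standard prefix, export
# CFLAGS="-I<prefix>/include" LDFLAGS="-L<prefix>/lib" first, as the
# warning suggests (<prefix> is a placeholder, not a path from this log).
print("async_io compatible:", AsyncIOBuilder().is_compatible())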
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
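Every rank prints an identical copy of this environment summary (the ds_report CLI shows the full table). The same fields can also be recovered programmatically, which is handy for diffing nodes; everything except the nvcc version is plain package introspection. A minimal sketch (nvcc comes from the CUDA toolkit on PATH, not from Python, so it is left out):

import torch
import deepspeed

# Field names mirror the "general environment info" report above.
print("torch install path ...............", torch.__path__)
print("torch version ....................", torch.__version__)
print("torch cuda version ...............", torch.version.cuda)
print("deepspeed install path ...........", deepspeed.__path__)
print("deepspeed info ...................", deepspeed.__version__)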
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - ............... [NO] ....... [NO] -async_io ............... [NO] .......transformer_inference [NO].. - [NO] ....... [OKAY] -utils .................. [YES]transformer_inference ........ [OKAY][NO] - ....... quantizer[OKAY] -.............. [NO] ....... [OKAY] -utils .................. [YES] --------------------------------------------------...... - [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference async_io.. [NO] ...................... [NO][OKAY] -....... [NO] -utils .................. [YES] ...... [OKAY] -transformer_inferencequantizer ................ [NO][NO] .............. [OKAY][OKAY] - ---------------------------------------------------utils - .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.async_io - ............... [NO] ....... [NO] -async_io [WARNING]  async_io: please install the libaio-devel package with yum ............... [NO] -.......transformer_inference [NO].. - [NO] ....... [OKAY] -utils .................. transformer_inference[YES] ........ [OKAY][NO] - ....... [OKAY]quantizer - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - .............. [NO] .......utils [OKAY].................. - [YES] ...... [OKAY]-------------------------------------------------- - -async_io ............... [NO] ....... [NO] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -async_io ............... [NO] ....... 
[NO] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.async_io -transformer_inference .. [NO] ....... [OKAY] - ............... [NO] ....... [NO] -async_io ............... [NO]transformer_inference ......... [NO][NO] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - ....... [OKAY] --------------------------------------------------- -utilstransformer_inference .................... [YES][NO] ............. [OKAY][OKAY] - -quantizer .............. utils[NO] ......................... [YES][OKAY] -...... [OKAY] --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  async_io: please install the libaio-devel package with yum -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference utils.. ..................[NO] [YES]....... ......[OKAY] -[OKAY] -quantizer ..............utils [NO].................. .......[YES] [OKAY]...... - [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. --------------------------------------------------- -async_io ............... [NO] ....... [NO] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -utils .................. [YES] ...... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference ..utils [NO].................. .......[YES] ......[OKAY] -[OKAY] -quantizer .............. [NO] ....... [OKAY] -quantizer utils.............. ..................[NO] [YES]....... ......[OKAY] -[OKAY] --------------------------------------------------- ---------------------------------------------------quantizer - .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... 
[NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.  [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -............... - [NO] ....... [NO] -async_ioasync_io ............... [NO]transformer_inference............... .........[NO] [NO].......[NO] -.......[NO] -[OKAY] -utils .................. [YES] ...... transformer_inference[OKAY] -transformer_inference.. ..[NO]quantizer [NO]..................... .......[OKAY][NO] - [OKAY]....... - [OKAY] -utils utils..................-------------------------------------------------- -..................[YES] [YES]...... ......[OKAY] -[OKAY] -quantizerquantizer ............................ [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  async_io: please install the libaio-devel package with yum -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] -utils .................. [YES] ...... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. 
[NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... async_io[NO] - ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO] .......utils [OKAY].................. - [YES] ...... [OKAY] -utils .................. [YES]quantizer .................... [OKAY][NO] - ....... [OKAY]quantizer - .............. [NO] --------------------------------------------------....... - [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -DeepSpeed general environment info: -transformer_inference .. [NO] ....... [OKAY] -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -utils .................. [YES] ...... [OKAY] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io async_io............... [NO]............... .......[NO] [NO]....... - [NO] -transformer_inference transformer_inference.. ..[NO] [NO]....... .......[OKAY] -[OKAY] -utils ..................utils [YES].................. ......[YES] [OKAY]...... - [OKAY] -quantizer quantizer.............. ..............[NO] [NO]....... .......[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.async_io - ............... [NO] ....... [NO] -async_io ............... [NO] .......transformer_inference [NO].. - [NO] ....... [OKAY] -utils transformer_inference.................. ..[YES] [NO]...... .......[OKAY] -[OKAY] -quantizer .............. utils[NO] ......................... [YES][OKAY] -...... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -[NO] ....... [NO] -async_io transformer_inference............... ..[NO] [NO]....... .......[NO] -[OKAY] -utils .................. [YES] ...... [OKAY]transformer_inference - .. quantizer[NO] ..................... [NO][OKAY] -....... [OKAY] -utils --------------------------------------------------.................. - [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY]
---------------------------------------------------
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
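The libaio warnings and the op table above are emitted once per rank, so they repeat for every process in the job; the two columns of the table are installed / compatible status, and async_io staying [NO] is harmless here since no other op in the run depends on it. A minimal sketch of the two remedies the warning itself suggests, assuming a from-source libaio under $HOME/local (an illustrative path, not taken from this cluster):

    # Option 1: system package (needs root), exactly as the warning suggests
    yum install libaio-devel

    # Option 2: point the compiler at an existing libaio install and
    # prebuild the op; DS_BUILD_AIO=1 asks DeepSpeed to compile async_io
    # at install time instead of JIT-compiling it at startup.
    export CFLAGS="-I$HOME/local/include"
    export LDFLAGS="-L$HOME/local/lib"
    DS_BUILD_AIO=1 pip install --no-cache-dir deepspeed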
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
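Like the op table, this environment block is printed once per rank, which is why it recurs throughout the log. The same two sections (op compatibility plus general environment info) can be regenerated offline with DeepSpeed's ds_report utility; a quick check, assuming the same conda environment is active:

    # Regenerate the op-compatibility table and environment summary
    # without launching a job.
    ds_report
    # Module form (assumed equivalent; the entry-point name may vary
    # across DeepSpeed versions):
    python -m deepspeed.env_report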
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ......
torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... DeepSpeed general environment info:11.2 -deepspeed install path - ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']torch install path -deepspeed info .................................. 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ......['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch 1.8, cuda 11.1 -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
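The environment block above is emitted by every rank at start-up; it is the same report DeepSpeed exposes on the command line as `ds_report`. A minimal Python sketch (assuming only that torch and deepspeed are importable) that reproduces the key fields:

    import torch
    import deepspeed

    # the same fields as the "DeepSpeed general environment info" block above
    print("torch install path ...", torch.__path__)
    print("torch version ........", torch.__version__)
    print("torch cuda version ...", torch.version.cuda)
    print("deepspeed info .......", deepspeed.__version__)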
-**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ****
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
-------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1587010.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 1000
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 55
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.02
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 6e-05
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 216320
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... False
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ........................ PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
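The batch bookkeeping in the line above follows directly from the argument dump; a quick arithmetic check in plain Python, with the values copied from the arguments (not Megatron's actual code):

    global_batch_size, micro_batch_size, data_parallel_size = 2048, 1, 1
    num_micro_batches = global_batch_size // (micro_batch_size * data_parallel_size)
    assert num_micro_batches == 2048  # "setting number of micro-batches to constant 2048"

    # with train_samples fixed, the iteration count logged later in this section
    # ("setting training iterations to 292968") is samples // global batch:
    assert 600_000_000 // global_batch_size == 292_968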
-**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ****
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-> setting tensorboard ...
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
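The padded-vocab line above follows Megatron's rule of rounding the vocabulary up to a multiple of make_vocab_size_divisible_by times the tensor-parallel size, so each of the 4 tensor-parallel shards gets an equal slice of the embedding table. A sketch of the arithmetic:

    vocab_size = 50257
    multiple = 128 * 4        # make_vocab_size_divisible_by * tensor_model_parallel_size
    padded = ((vocab_size + multiple - 1) // multiple) * multiple
    assert padded == 50688 and padded - vocab_size == 431   # matches the log line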
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ****
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-18 04:28:14,983] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/data'
-g++ -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color -I/gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -I/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/pybind11/include helpers.cpp -o helpers.cpython-38-x86_64-linux-gnu.so
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/data'
->>> done with dataset index builder. Compilation time: 6.291 seconds
-WARNING: constraints for invoking optimized fused softmax kernel are not met. We default back to unfused kernel invocations.
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-[1/3] c++ -MMD -MF layer_norm_cuda.o.d -DTORCH_EXTENSION_NAME=fused_mix_prec_layer_norm_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/fused_kernels/layer_norm_cuda.cpp -o layer_norm_cuda.o
-[2/3] /gpfslocalsys/cuda/11.2/bin/nvcc --generate-dependencies-with-compile --dependency-output layer_norm_cuda_kernel.cuda.o.d -DTORCH_EXTENSION_NAME=fused_mix_prec_layer_norm_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 -gencode arch=compute_70,code=sm_70 --use_fast_math -maxrregcount=50 -gencode arch=compute_80,code=sm_80 -std=c++14 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/fused_kernels/layer_norm_cuda_kernel.cu -o layer_norm_cuda_kernel.cuda.o
-[3/3] c++ layer_norm_cuda.o layer_norm_cuda_kernel.cuda.o -shared -L/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/gpfslocalsys/cuda/11.2/lib64 -lcudart -o fused_mix_prec_layer_norm_cuda.so
-Loading extension module fused_mix_prec_layer_norm_cuda...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
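The [1/3]..[3/3] lines above are ninja building a PyTorch C++/CUDA extension. A minimal sketch of how such a fused kernel is JIT-built and loaded through torch.utils.cpp_extension; the source paths are illustrative, not the exact Megatron call:

    from torch.utils import cpp_extension

    # compiles with ninja into a build directory, then imports the resulting .so
    fused = cpp_extension.load(
        name="fused_mix_prec_layer_norm_cuda",
        sources=["layer_norm_cuda.cpp",            # hypothetical relative paths
                 "layer_norm_cuda_kernel.cu"],
        extra_cuda_cflags=["-O3", "--use_fast_math", "-maxrregcount=50"],
        verbose=True,
    )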
->>> done with compiling and loading fused kernels. Compilation time: 25.466 seconds
-time to initialize megatron (seconds): 94.777
-[after megatron is initialized] datetime: 2021-10-18 04:28:46
-building GPT model ...
-[2021-10-18 04:28:46,846] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-18 04:28:46,847] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-18 04:28:46,847] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.54 GB, percent = 21.1%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3,
- ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7,
- ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11,
- ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15,
- ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19,
- ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23,
- ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27,
- ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31,
- ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35,
- ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39,
- ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43,
- ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47,
- ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51,
- ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55,
- ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59,
- ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63,
- ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67,
- ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71,
- ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75,
- ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79,
- ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83,
- ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87,
- ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91,
- ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95,
- ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99,
- ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103,
- ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107,
- ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111,
- ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115,
- ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119,
- ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123,
- ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
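The topology table above is fully regular: with the axes ordered (pipe, data, model), the model index varies fastest, so with data_parallel_size=1 the global rank is simply pipe*4 + model. A sketch that reconstructs the table:

    PP, DP, TP = 32, 1, 4   # pipeline, data, tensor ("model") parallel sizes
    rank = {(p, d, m): (p * DP + d) * TP + m
            for p in range(PP) for d in range(DP) for m in range(TP)}
    assert rank[(0, 0, 3)] == 3 and rank[(1, 0, 0)] == 4 and rank[(31, 0, 3)] == 127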
-[2021-10-18 04:28:48,522] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2:
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-    10: ParallelTransformerLayerPipe
-stage=4 layers=2
-    11: ParallelTransformerLayerPipe
-    12: ParallelTransformerLayerPipe
-stage=5 layers=2
-    13: ParallelTransformerLayerPipe
-    14: ParallelTransformerLayerPipe
-stage=6 layers=2
-    15: ParallelTransformerLayerPipe
-    16: ParallelTransformerLayerPipe
-stage=7 layers=2
-    17: ParallelTransformerLayerPipe
-    18: ParallelTransformerLayerPipe
ParallelTransformerLayerPipe -stage=8 layers=2 - 19: ParallelTransformerLayerPipe - 20: ParallelTransformerLayerPipe -stage=9 layers=2 - 21: ParallelTransformerLayerPipe - 22: ParallelTransformerLayerPipe -stage=10 layers=2 - 23: ParallelTransformerLayerPipe - 24: ParallelTransformerLayerPipe -stage=11 layers=2 - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=12 layers=2 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe -stage=13 layers=2 - 29: ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=14 layers=2 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe -stage=15 layers=2 - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe -stage=16 layers=2 - 35: ParallelTransformerLayerPipe - 36: ParallelTransformerLayerPipe -stage=17 layers=2 - 37: ParallelTransformerLayerPipe - 38: ParallelTransformerLayerPipe -stage=18 layers=2 - 39: ParallelTransformerLayerPipe - 40: ParallelTransformerLayerPipe -stage=19 layers=2 - 41: ParallelTransformerLayerPipe - 42: ParallelTransformerLayerPipe -stage=20 layers=2 - 43: ParallelTransformerLayerPipe - 44: ParallelTransformerLayerPipe -stage=21 layers=2 - 45: ParallelTransformerLayerPipe - 46: ParallelTransformerLayerPipe -stage=22 layers=2 - 47: ParallelTransformerLayerPipe - 48: ParallelTransformerLayerPipe -stage=23 layers=2 - 49: ParallelTransformerLayerPipe - 50: ParallelTransformerLayerPipe -stage=24 layers=2 - 51: ParallelTransformerLayerPipe - 52: ParallelTransformerLayerPipe -stage=25 layers=2 - 53: ParallelTransformerLayerPipe - 54: ParallelTransformerLayerPipe -stage=26 layers=2 - 55: ParallelTransformerLayerPipe - 56: ParallelTransformerLayerPipe -stage=27 layers=2 - 57: ParallelTransformerLayerPipe - 58: ParallelTransformerLayerPipe -stage=28 layers=2 - 59: ParallelTransformerLayerPipe - 60: ParallelTransformerLayerPipe -stage=29 layers=2 - 61: ParallelTransformerLayerPipe - 62: ParallelTransformerLayerPipe -stage=30 layers=2 - 63: ParallelTransformerLayerPipe - 64: ParallelTransformerLayerPipe -stage=31 layers=6 - 65: ParallelTransformerLayerPipe - 66: ParallelTransformerLayerPipe - 67: - 68: MixedFusedLayerNorm - 69: EmbeddingPipe - 70: float16_to_fp32 - loss: CrossEntropy - > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 > number of 
parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800 - > number of parameters on (tensor, pipeline) model 
parallel rank (1, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
-[2021-10-18 04:28:49,231] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-18 04:28:49,231] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-18 04:28:49,232] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.71 GB, percent = 21.2%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
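The per-rank counts above can be cross-checked against the totals the engine prints further down (TOTAL_PARAMS=104731203200, UNIQUE_PARAMS=104048195200). A minimal sketch in Python, assuming the tensor-parallel degree of 4 and the 32 pipeline stages implied by the (tensor, pipeline) rank tuples in this log:

TP, PP = 4, 32
first_stage = 978_291_800   # pipeline rank 0: embedding plus first transformer layers
last_stage = 978_315_000    # pipeline rank 31: final layers plus output head
middle_stage = 807_539_800  # pipeline ranks 1..30

total = TP * (first_stage + last_stage + (PP - 2) * middle_stage)
assert total == 104_731_203_200  # matches TOTAL_PARAMS reported by engine.py below

# TOTAL_PARAMS - UNIQUE_PARAMS is 683,008,000 = TP * 170,752,000, which is
# exactly the first stage's surplus over a middle stage -- consistent with
# the input embedding being tied to the output layer and counted on both ends.
assert total - 104_048_195_200 == TP * (first_stage - middle_stage)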
-[2021-10-18 04:28:49,232] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+cd7967d, git-hash=cd7967d, git-branch=master
-[2021-10-18 04:28:49,271] [INFO] [engine.py:204:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-18 04:28:49,271] [INFO] [engine.py:848:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-18 04:28:49,271] [INFO] [engine.py:854:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-18 04:28:49,272] [INFO] [engine.py:870:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-18 04:28:49,272] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-18 04:28:49,272] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-18 04:28:49,272] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-18 04:28:49,272] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-18 04:28:49,272] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-18 04:28:49,272] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
-Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 64 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 46 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 97 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-[2021-10-18 04:28:51,104] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-18 04:28:51,105] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-18 04:28:51,105] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.74 GB, percent = 21.2%
-[2021-10-18 04:28:51,159] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-18 04:28:51,159] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-18 04:28:51,160] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.74 GB, percent = 21.2%
-[2021-10-18 04:28:51,160] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-18 04:28:51,189] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-18 04:28:51,189] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-18 04:28:51,190] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.74 GB, percent = 21.2%
-[2021-10-18 04:28:51,190] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-18 04:28:51,190] [INFO] [engine.py:596:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-18 04:28:51,190] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-18 04:28:51,190] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-18 04:28:51,190] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] activation_checkpointing_config {
- "partition_activations": false,
- "contiguous_memory_optimization": false,
- "cpu_checkpointing": false,
- "number_checkpoints": null,
- "synchronize_checkpoint_boundary": false,
- "profile": false
-}
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-18 04:28:51,190] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] flops_profiler_config ........ {
- "enabled": false,
- "profile_step": 1,
- "module_depth": -1,
- "top_modules": 1,
- "detailed": true,
- "output_file": null
-}
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-18 04:28:51,191] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] zero_config .................. {
- "stage": 1,
- "contiguous_gradients": true,
- "reduce_scatter": true,
- "reduce_bucket_size": 5.000000e+08,
- "allgather_partitions": true,
- "allgather_bucket_size": 5.000000e+08,
- "overlap_comm": false,
- "load_from_fp32_weights": true,
- "elastic_checkpoint": true,
- "offload_param": null,
- "offload_optimizer": null,
- "sub_group_size": 1.000000e+09,
- "prefetch_bucket_size": 5.000000e+07,
- "param_persistence_threshold": 1.000000e+05,
- "max_live_parameters": 1.000000e+09,
- "max_reuse_distance": 1.000000e+09,
- "gather_fp16_weights_on_model_save": false,
- "ignore_unused_parameters": true,
- "round_robin_gradients": false,
- "legacy_stage1": false
-}
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-18 04:28:51,192] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-18 04:28:51,192] [INFO] [config.py:946:print] json = {
- "train_micro_batch_size_per_gpu": 1,
- "train_batch_size": 2.048000e+03,
- "gradient_clipping": 1.0,
- "zero_optimization": {
- "stage": 1
- },
- "fp16": {
- "enabled": true,
- "loss_scale": 0,
- "loss_scale_window": 500,
- "hysteresis": 2,
- "min_loss_scale": 1,
- "initial_scale_power": 12
- },
- "curriculum_learning": {
- "enabled": true,
- "curriculum_type": "seqlen",
- "min_difficulty": 64,
- "max_difficulty": 2.048000e+03,
- "schedule_type": "fixed_linear",
- "schedule_config": {
- "total_curriculum_step": 3.600000e+04,
- "difficulty_step": 8
- }
- },
- "steps_per_print": 2.000000e+03,
- "wall_clock_breakdown": false
-}
-[2021-10-18 04:28:51,193] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,583] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:28:51,672] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-[2021-10-18 04:28:51,672] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - will not load any checkpoints and will start from random
-[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,673] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,674] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,675] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,676] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,676] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-18 04:28:51,676] [WARNING] [engine.py:1981:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-time (ms) | load-checkpoint: 5.46
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
-estimated model parameters without embeddings: 103.3650944
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be
inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - 
warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter 
count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without 
embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold 
several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the 
embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters 
without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 125.22432 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model 
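The UserWarning and the differing per-rank estimates above are two sides of the same issue: with pipeline parallelism (PP > 1) the first and last pipeline stages each hold a copy of the tied word embeddings, so any per-rank count that includes embeddings over-counts them, which is why the "with embeddings" figures disagree across ranks (103.3650944 vs 125.22432) while the "without embeddings" figure stays stable. A minimal sketch of that bookkeeping, with placeholder sizes that are not claimed to reproduce the logged figures:

    # Sketch only: shows why naive per-stage sums over-count tied embeddings
    # under PP > 1. Sizes are placeholders, not the actual tr8b-104B config.

    def transformer_params(num_layers: int, hidden: int) -> int:
        # Rough per-layer count for a GPT block: attention (~4*h*h) + MLP (~8*h*h)
        # plus biases/layernorms (~13*h); embeddings excluded.
        return num_layers * (12 * hidden * hidden + 13 * hidden)

    def embedding_params(vocab: int, hidden: int) -> int:
        return vocab * hidden

    def naive_pp_sum(num_layers: int, hidden: int, vocab: int, pp_size: int) -> int:
        # First and last stage each hold a copy of the tied embedding matrix,
        # so summing per-stage counts includes it twice when pp_size > 1.
        copies = 2 if pp_size > 1 else 1
        return transformer_params(num_layers, hidden) + copies * embedding_params(vocab, hidden)

    if __name__ == "__main__":
        L, H, V = 64, 11600, 50432   # hypothetical sizes for illustration
        print("without embeddings:", transformer_params(L, H) / 1e9, "billion")
        print("naive sum, PP > 1: ", naive_pp_sum(L, H, V, pp_size=2) / 1e9, "billion")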
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-18 04:28:51
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train: 600000000
-    validation: 3000320
-    test: 10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.127187 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.262 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.156 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.056 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
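The split boundaries logged above are consistent with a 949:50:1 train/validation/test split of the 304,230,423 documents, and the *_indexmap_*.npy files are cached doc-idx/sample-idx/shuffle-idx mappings keyed on the target size, sequence length and seed embedded in their names (600000000ns_2048sl_43s). A sketch of the boundary arithmetic, assuming the rounding scheme used by Megatron's get_train_valid_test_split_; it reproduces the boundaries in the log:

    # Sketch of the split arithmetic, assuming a "949,50,1" split string.

    def split_boundaries(size: int, weights=(949, 50, 1)):
        total = float(sum(weights))
        fracs = [w / total for w in weights]
        idx = [0]
        for f in fracs:
            idx.append(idx[-1] + int(round(f * size)))
        # Push any rounding error into the boundaries so the last one hits `size`.
        diff = idx[-1] - size
        for i in range(1, len(idx)):
            idx[i] -= diff
        return idx

    print(split_boundaries(304230423))   # [0, 288714672, 303926193, 304230423]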
-[after dataloaders are built] datetime: 2021-10-18 04:28:57
-done with setup ...
-training ...
-Number of parameters: 103.3650944 billion
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-time (ms) | model-and-optimizer-setup: 4896.38 | train/valid/test-data-iterators-setup: 5425.07
-[before the start of training step] datetime: 2021-10-18 04:28:57
-[2021-10-18 04:28:57,799] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-18 04:28:57,799] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-18 04:28:57,799] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-18 04:28:57,799] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-18 04:28:57,799] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
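The five INFO lines above are DeepSpeed echoing its activation-checkpointing settings for the 64 transformer layers: activations are recomputed in the backward pass, with activation partitioning, CPU offload, contiguous-memory optimization, boundary synchronization and profiling all disabled. For reference, a sketch of the config section these flags correspond to, using the key names from DeepSpeed's documented activation_checkpointing schema:

    # Sketch: the activation-checkpointing settings implied by the INFO
    # lines above, expressed as the matching DeepSpeed config section.
    ds_config = {
        "activation_checkpointing": {
            "partition_activations": False,             # "----Partition Activations False"
            "cpu_checkpointing": False,                 # "CPU CHECKPOINTING False"
            "contiguous_memory_optimization": False,    # "----contiguous Memory Checkpointing False"
            "synchronize_checkpoint_boundary": False,   # "----Synchronization False"
            "profile": False,                           # "----Profiling time in checkpointing False"
        }
    }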
2021-10-18T04:42:03 *** -Killing subprocess 2635825 -Killing subprocess 2063452 -Killing subprocess 756094 -Killing subprocess 3068823 -Killing subprocess 2635826 -Killing subprocess 756095 -Killing subprocess 2063453 -srun: Job step aborted: Waiting up to 62 seconds for job step to finish. -Killing subprocess 2635827 -Killing subprocess 715627 -Killing subprocess 756096 -Killing subprocess 2635829 -Killing subprocess 3068824 -Killing subprocess 756097 -Main process received SIGTERM, exiting -Killing subprocess 715628 -Killing subprocess 715629 -Killing subprocess 3068825 -Killing subprocess 715630 -Killing subprocess 1542999 -Killing subprocess 3068826 -Killing subprocess 2859693 -Main process received SIGTERM, exiting -Killing subprocess 2859694 -Main process received SIGTERM, exiting -Killing subprocess 1543000 -Killing subprocess 2063454 -Killing subprocess 2063455 -Main process received SIGTERM, exiting -Killing subprocess 2859695 -Main process received SIGTERM, exiting -Killing subprocess 2859697 -Killing subprocess 1543001 -Killing subprocess 1543003 -Killing subprocess 1539704 -Main process received SIGTERM, exiting -Killing subprocess 1539705 -Killing subprocess 1542563 -Main process received SIGTERM, exiting -Killing subprocess 1539706 -Killing subprocess 1542564 -Killing subprocess 3179848 -Killing subprocess 1539707 -Killing subprocess 1542565 -Main process received SIGTERM, exiting -Killing subprocess 3179849 -Killing subprocess 1555847 -Killing subprocess 2864365 -Killing subprocess 3179850 -Killing subprocess 1545594 -Killing subprocess 2864366 -Killing subprocess 1555848 -Killing subprocess 1542567 -Killing subprocess 3179851 -Main process received SIGTERM, exiting -Killing subprocess 1545595 -Killing subprocess 1555849 -Killing subprocess 395964 -Killing subprocess 2864367 -Killing subprocess 2864369 -Killing subprocess 1555850 -Killing subprocess 1550099 -Main process received SIGTERM, exiting -Killing subprocess 1545596 -Killing subprocess 3395959 -Killing subprocess 1542944 -Killing subprocess 4108363 -Killing subprocess 1543631 -Killing subprocess 395965 -Killing subprocess 1287928 -Killing subprocess 393464 -Killing subprocess 1550100 -Killing subprocess 376841 -Killing subprocess 19464 -Killing subprocess 626626 -Killing subprocess 4108364 -Main process received SIGTERM, exiting -Killing subprocess 3395960 -Killing subprocess 567183 -Killing subprocess 1543632 -Killing subprocess 1816835 -Killing subprocess 1542945 -Killing subprocess 393465 -Killing subprocess 4002214 -Killing subprocess 395966 -Killing subprocess 1649390 -Killing subprocess 481710 -Killing subprocess 376842 -Killing subprocess 1550101 -Killing subprocess 1542946 -Killing subprocess 4108365 -Killing subprocess 1287929 -Killing subprocess 1543633 -Main process received SIGTERM, exiting -Killing subprocess 1816836 -Killing subprocess 626627 -Killing subprocess 1934174 -Killing subprocess 1287930 -Killing subprocess 3395961 -Killing subprocess 4002215 -Killing subprocess 1649391 -Killing subprocess 19465 -Killing subprocess 3395962 -Killing subprocess 567184 -Killing subprocess 1543634 -Killing subprocess 4108367 -Killing subprocess 393466 -Killing subprocess 1649392 -Killing subprocess 626628 -Main process received SIGTERM, exiting -Killing subprocess 393467 -Killing subprocess 1545597 -Killing subprocess 4002216 -Killing subprocess 481711 -Main process received SIGTERM, exiting -Killing subprocess 567185 -Killing subprocess 355930 -Killing subprocess 1542947 -Killing subprocess 376843 -Main process 
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
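The banner above is printed by torch.distributed's launcher, which defaults OMP_NUM_THREADS to 1 for every worker so that N ranks per node do not oversubscribe the CPUs. A sketch of overriding it per worker (the value 4 is illustrative; it must be set before OpenMP initializes, i.e. before torch is imported):

    import os

    os.environ.setdefault("OMP_NUM_THREADS", "4")  # illustrative; tune to cores-per-rank

    import torch
    # Keep torch's intra-op thread pool consistent with the OpenMP setting.
    torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
    print(torch.get_num_threads())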
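On relaunch every rank prints DeepSpeed's C++/CUDA extension op report; a single clean copy is kept below. The same table can be produced with the ds_report command, or queried programmatically, a sketch assuming a 2021-era DeepSpeed where deepspeed.ops.op_builder exposes one builder class per op:

    # Query op compatibility roughly the way ds_report does
    # (sketch; assumes a DeepSpeed ~0.5.x layout of deepspeed.ops.op_builder).
    from deepspeed.ops.op_builder import (
        CPUAdamBuilder, FusedAdamBuilder, FusedLambBuilder,
        SparseAttnBuilder, TransformerBuilder, StochasticTransformerBuilder,
    )

    for builder in (CPUAdamBuilder(), FusedAdamBuilder(), FusedLambBuilder(),
                    SparseAttnBuilder(), TransformerBuilder(), StochasticTransformerBuilder()):
        # is_compatible() corresponds to the [OKAY] column in the report below.
        print(f"{builder.name:24s} compatible={builder.is_compatible()}")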
-***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** ----------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -DeepSpeed C++/CUDA extension op report - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- --------------------------------------------------- -JIT compiled ops requires ninja - -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninja .................................... [OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- -op name op name................ ................installedninja ..installed.................. compatible..[OKAY] - ---------------------------------------------------compatible --------------------------------------------------- - --------------------------------------------------- -op name ................ installedcpu_adam ................. compatiblecpu_adam -[YES] -------------------------------------------------- -..................... [YES][OKAY] - cpu_adam...... ............... [OKAY][YES] - ...... fused_adam[OKAY] -............. [NO] ....... [OKAY]fused_adam - .............fused_adam fused_lamb [NO].......................... .......[NO][NO] .......[OKAY]....... - [OKAY][OKAY] - -fused_lamb fused_lamb............. ............. [NO][NO] .............. [OKAY][OKAY]sparse_attn - -............ [NO] ....... [OKAY] -transformer ............ sparse_attnsparse_attn[NO] ............................... [NO][OKAY] -[NO]....... 
stochastic_transformer[OKAY]....... - [OKAY]. - transformer[NO] transformer................... ............[NO] [OKAY] ....... -[NO] [OKAY]....... - [OKAY] -stochastic_transformer stochastic_transformer. [NO]. ....... [NO][OKAY] -....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer --------------------------------------------------. [NO] -.......DeepSpeed C++/CUDA extension op report -[OKAY]-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -ninja .................. [OKAY] -cpu_adam --------------------------------------------------............... - [YES]op name ...................... 
[OKAY]installed - .. compatible --------------------------------------------------- -fused_adam .............cpu_adam [NO]............... .......[YES] ......[OKAY] -[OKAY] ---------------------------------------------------fused_lamb ............. - DeepSpeed C++/CUDA extension op report[NO] - --------------------------------------------------fused_adam....... - .............NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.[OKAY] -[NO] --------------------------------------------------- -.......JIT compiled ops requires ninja -[OKAY] -fused_lamb ............. sparse_attn[NO] ................... [OKAY][NO] - ....... [OKAY] -transformer ............ [NO] ....... [OKAY]sparse_attn - ............ [NO] .......stochastic_transformer [OKAY] -. [NO]transformer ................... [NO][OKAY] -....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ 
[NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ------------------------------------------------------------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninjaJIT compiled ops requires ninja - - -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ...............ninja [YES] ........................ [OKAY][OKAY] - --------------------------------------------------- -op name ................ installedninja .. fused_adam..................compatible -.............[OKAY] -------------------------------------------------- - -[NO] --------------------------------------------------....... - [OKAY]op name - ................ cpu_adamfused_lambinstalled .............................. [YES][NO]compatible -............. -------------------------------------------------- [OKAY] -[OKAY] - -cpu_adam ............... fused_adam[YES] .............sparse_attn...... [NO]............[OKAY] -.......[NO] [OKAY]....... - [OKAY] -fused_lamb .............transformer [NO]............ fused_adam.......[NO] .............[OKAY]....... - [NO][OKAY] -....... [OKAY] -stochastic_transformer .fused_lamb [NO]............. .......sparse_attn[NO] [OKAY]................... - [NO][OKAY] -....... [OKAY] -transformer ............ [NO] ....... [OKAY] -sparse_attnstochastic_transformer ............ [NO]. .......[NO] [OKAY]....... - [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ninja............... [YES].................. ......[OKAY] [OKAY] - --------------------------------------------------- -op name ................ installed .. compatible ---------------------------------------------------fused_adam - ............. [NO] ....... [OKAY] -cpu_adam ............... fused_lamb[YES] ................... [NO][OKAY] -....... [OKAY] -fused_adam ............. [NO] .......sparse_attn [OKAY] -............ [NO] .......fused_lamb [OKAY]............. - [NO] ....... transformer[OKAY] -............ [NO] ....... [OKAY] -stochastic_transformer .sparse_attn [NO]............ .......[NO] [OKAY]....... - [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... 
[YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -ninja .................. [OKAY] -sparse_attn ............-------------------------------------------------- -[NO] op name....... ................[OKAY] -transformer ............ installed[NO] ....... [OKAY] - stochastic_transformer.. compatible. - --------------------------------------------------[NO] ....... [OKAY] - -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer ninja. [NO].................. [OKAY] - .......-------------------------------------------------- -[OKAY] -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... 
[OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer ninja. ..................[NO] [OKAY]....... - [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... 
[OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... 
[OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [YES][YES] ............ [OKAY][OKAY] - -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lambfused_lamb ............. .............[NO] [NO]....... .......[OKAY] -[OKAY] -sparse_attnsparse_attn ........................ [NO][NO] .............. [OKAY][OKAY] - -transformertransformer ........................ [NO][NO] .............. [OKAY][OKAY] - -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - -ninja .................. 
[OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . ninja[NO] ....... ..................[OKAY] -[OKAY] --------------------------------------------------- -op name ................ installed .. 
compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] --------------------------------------------------- -sparse_attn ............ [NO] ....... [OKAY] -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. ninja[NO] ......................... [OKAY][OKAY] - --------------------------------------------------- -op namefused_lamb ............................. installed[NO] ......... compatible[OKAY] - --------------------------------------------------- -cpu_adam ............... sparse_attn[YES] .................. [NO][OKAY] -....... [OKAY] -transformer ............ [NO] fused_adam....... .............[OKAY] -[NO] ....... [OKAY]stochastic_transformer - . [NO]fused_lamb .................... [OKAY][NO] - ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
- [WARNING] async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING] async_io: please install the libaio-devel package with yum
- [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
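The async_io warnings mean the libaio development files could not be found, so the async_io op cannot be JIT-built and is reported [NO]/[NO]; this disables only DeepSpeed's asynchronous I/O (AIO) path and leaves the other ops unaffected. Following the hint in the warning itself, a source-installed libaio could be exposed to the JIT build before DeepSpeed is imported, roughly like this (/path/to/libaio is a hypothetical placeholder, not a path from this system):

    # Sketch: apply the CFLAGS/LDFLAGS hint from the warning above so the
    # async_io JIT build can locate a libaio installed from source.
    import os

    libaio_prefix = "/path/to/libaio"  # hypothetical install prefix
    os.environ["CFLAGS"] = f"-I{libaio_prefix}/include " + os.environ.get("CFLAGS", "")
    os.environ["LDFLAGS"] = f"-L{libaio_prefix}/lib " + os.environ.get("LDFLAGS", "")

    import deepspeed  # import after the variables are set so the build sees them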
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ......
torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch version .................... 1.8.1 -async_io ............... [NO] ....... [NO] -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io: please install the libaio-devel package with yum -transformer_inference .. [NO] ....... 
[OKAY] -utils .................. [YES] ...... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY]async_io - ............... [NO] ....... [NO]utils - .................. [YES] ...... [OKAY] -quantizer .............. [NO]transformer_inference ......... [OKAY][NO] - ....... [OKAY] --------------------------------------------------- -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io: please install the libaio-devel package with yum -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. --------------------------------------------------- -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] --------------------------------------------------- -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. async_io[NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] - ............... [NO] utils....... ..................[NO] -[YES] ...... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -transformer_inference .. --------------------------------------------------[NO] -quantizer .............. [NO] ....... [OKAY] - ....... [OKAY] --------------------------------------------------- -utils .................. [YES] ...... 
-**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ****
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+cd7967d, cd7967d, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path - ............... torch install path ...............['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version ....................['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -1.8.1 -torch cuda versiontorch version ................................... 11.1 -1.8.1 -nvcc version .....................torch cuda version 11.2............... - deepspeed install path11.1 -...........nvcc version ..................... 11.2['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed install pathdeepspeed info .............................. 0.5.5+cd7967d, cd7967d, master -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed wheel compiled w. - ......deepspeed info torch 1.8, cuda 11.1 -................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... DeepSpeed general environment info: -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch install pathtorch version .................... ...............1.8.1 -torch cuda version ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']11.1 - -nvcc versiontorch version ......................................... 11.2 -1.8.1 -deepspeed install path ...........torch cuda version ............... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']11.1 - -deepspeed infonvcc version ........................................ 0.5.5+cd7967d, cd7967d, master11.2 - -deepspeed wheel compiled w.deepspeed install path ................. torch 1.8, cuda 11.1 -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 
11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path ............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 
1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+cd7967d, cd7967d, master0.5.5+cd7967d, cd7967d, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+cd7967d, cd7967d, master -deepspeed wheel compiled w. ...... 
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1587017.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 1000
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 55
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.02
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 6e-05
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 216320
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... False
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
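The argument dump above pins down the 3D-parallel layout, and its numbers are mutually consistent; a minimal Python check (variable names are ours, all values copied from the log):

# Sketch: consistency of the parallel layout in the argument dump above.
tp, pp, dp = 4, 32, 1          # tensor/pipeline/data parallel sizes
world_size = 128
assert world_size == tp * pp * dp          # 4 * 32 * 1 == 128

# "setting number of micro-batches to constant 2048":
global_batch, micro_batch = 2048, 1
num_micro_batches = global_batch // (micro_batch * dp)
assert num_micro_batches == 2048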
-> building GPT2BPETokenizer tokenizer ...
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
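The padded-vocab line follows from make_vocab_size_divisible_by and the tensor-parallel degree: the embedding is padded to a multiple of 128 x 4 = 512 so that every tensor-parallel shard is equal-sized. A sketch of the arithmetic:

import math

orig_vocab = 50257        # GPT-2 BPE vocabulary
divisible_by = 128        # make_vocab_size_divisible_by
tp = 4                    # tensor_model_parallel_size

multiple = divisible_by * tp                          # 512
padded = math.ceil(orig_vocab / multiple) * multiple
assert padded == 50688
assert padded - orig_vocab == 431                     # the "431 dummy tokens"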
-> setting tensorboard ...
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
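The table above is DeepSpeed's standard extension-op report; assuming a working DeepSpeed install, the same tables can be regenerated on any node with the bundled ds_report CLI, e.g.:

import subprocess

# ds_report is installed alongside deepspeed and prints the same
# environment/op-compatibility tables as the report above.
subprocess.run(["ds_report"], check=True)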
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-18 04:45:50,651] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
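The seed line matches Megatron's convention of offsetting the base seed per tensor-parallel rank for the model-parallel CUDA RNG while reusing the base seed for the data-parallel RNG; a sketch of the arithmetic (the constant 2718 offset is assumed from Megatron's mpu/random code):

base_seed = 43        # --seed from the argument dump
tp_rank = 0           # tensor-model-parallel rank of global rank 0

# Model-parallel CUDA RNG seed = base_seed + 2718 + tp_rank (assumed
# convention); data-parallel RNG keeps the base seed.
model_parallel_seed = base_seed + 2718 + tp_rank
data_parallel_seed = base_seed
assert model_parallel_seed == 2761   # matches the INFO line above
assert data_parallel_seed == 43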
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/data'
->>> done with dataset index builder. Compilation time: 0.302 seconds
-WARNING: constraints for invoking optimized fused softmax kernel are not met. We default back to unfused kernel invocations.
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
->>> done with compiling and loading fused kernels. Compilation time: 4.130 seconds
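The fused-kernel step relies on PyTorch's JIT C++/CUDA extension loader; a minimal sketch of that mechanism under stated assumptions (the source file names below are hypothetical, not Megatron's actual file list). Exporting CXX=g++ before launch is the usual way to avoid the compiler-mismatch warning above:

import os
from torch.utils.cpp_extension import load

# Pointing CXX at g++ avoids the c++-vs-g++ UserWarning printed above.
os.environ.setdefault("CXX", "g++")

# Illustrative only: Megatron JIT-builds its fused kernels through this API;
# the source paths here are placeholders.
fused_ln = load(
    name="fused_mix_prec_layer_norm_cuda",
    sources=["layer_norm_cuda.cpp", "layer_norm_cuda_kernel.cu"],
    verbose=True,  # prints the "Emitting ninja build file ..." lines seen above
)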
-time to initialize megatron (seconds): -28.915
-[after megatron is initialized] datetime: 2021-10-18 04:45:55
-building GPT model ...
-[2021-10-18 04:45:55,148] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-18 04:45:55,149] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-18 04:45:55,149] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.55 GB, percent = 21.1%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
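The topology dict maps (pipe, data, model) coordinates to global ranks; with data fixed at 0, rank = pipe * 4 + model. A sketch that regenerates the mapping (ProcessCoord here is a stand-in namedtuple, not DeepSpeed's class):

from collections import namedtuple

# Stand-in for DeepSpeed's ProcessCoord, just to mirror the log's notation.
ProcessCoord = namedtuple("ProcessCoord", ["pipe", "data", "model"])

TP, PP = 4, 32
topology = {
    ProcessCoord(pipe=p, data=0, model=m): p * TP + m
    for p in range(PP)
    for m in range(TP)
}
assert topology[ProcessCoord(pipe=18, data=0, model=1)] == 73  # as logged
assert len(topology) == TP * PP == 128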
-[2021-10-18 04:45:56,825] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2:
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-     10: ParallelTransformerLayerPipe
-stage=4 layers=2
-     11: ParallelTransformerLayerPipe
-     12: ParallelTransformerLayerPipe
-stage=5 layers=2
-     13: ParallelTransformerLayerPipe
-     14: ParallelTransformerLayerPipe
-stage=6 layers=2
-     15: ParallelTransformerLayerPipe
-     16: ParallelTransformerLayerPipe
-stage=7 layers=2
-     17: ParallelTransformerLayerPipe
-     18: ParallelTransformerLayerPipe
-stage=8 layers=2
-     19: ParallelTransformerLayerPipe
-     20: ParallelTransformerLayerPipe
-stage=9 layers=2
-     21: ParallelTransformerLayerPipe
-     22: ParallelTransformerLayerPipe
-stage=10 layers=2
-     23: ParallelTransformerLayerPipe
-     24: ParallelTransformerLayerPipe
-stage=11 layers=2
-     25: ParallelTransformerLayerPipe
-     26: ParallelTransformerLayerPipe
-stage=12 layers=2
-     27: ParallelTransformerLayerPipe
-     28: ParallelTransformerLayerPipe
-stage=13 layers=2
-     29: ParallelTransformerLayerPipe
-     30: ParallelTransformerLayerPipe
-stage=14 layers=2
-     31: ParallelTransformerLayerPipe
-     32: ParallelTransformerLayerPipe
-stage=15 layers=2
-     33: ParallelTransformerLayerPipe
-     34: ParallelTransformerLayerPipe
-stage=16 layers=2
-     35: ParallelTransformerLayerPipe
-     36: ParallelTransformerLayerPipe
-stage=17 layers=2
-     37: ParallelTransformerLayerPipe
-     38: ParallelTransformerLayerPipe
-stage=18 layers=2
-     39: ParallelTransformerLayerPipe
-     40: ParallelTransformerLayerPipe
-stage=19 layers=2
-     41: ParallelTransformerLayerPipe
-     42: ParallelTransformerLayerPipe
-stage=20 layers=2
-     43: ParallelTransformerLayerPipe
-     44: ParallelTransformerLayerPipe
-stage=21 layers=2
-     45: ParallelTransformerLayerPipe
-     46: ParallelTransformerLayerPipe
-stage=22 layers=2
-     47: ParallelTransformerLayerPipe
-     48: ParallelTransformerLayerPipe
-stage=23 layers=2
-     49: ParallelTransformerLayerPipe
-     50: ParallelTransformerLayerPipe
-stage=24 layers=2
-     51: ParallelTransformerLayerPipe
-     52: ParallelTransformerLayerPipe
-stage=25 layers=2
-     53: ParallelTransformerLayerPipe
-     54: ParallelTransformerLayerPipe
-stage=26 layers=2
-     55: ParallelTransformerLayerPipe
-     56: ParallelTransformerLayerPipe
-stage=27 layers=2
-     57: ParallelTransformerLayerPipe
-     58: ParallelTransformerLayerPipe
-stage=28 layers=2
-     59: ParallelTransformerLayerPipe
-     60: ParallelTransformerLayerPipe
-stage=29 layers=2
-     61: ParallelTransformerLayerPipe
-     62: ParallelTransformerLayerPipe
-stage=30 layers=2
-     63: ParallelTransformerLayerPipe
-     64: ParallelTransformerLayerPipe
-stage=31 layers=6
-     65: ParallelTransformerLayerPipe
-     66: ParallelTransformerLayerPipe
-     67:
-     68: MixedFusedLayerNorm
-     69: EmbeddingPipe
-     70: float16_to_fp32
-  loss: CrossEntropy
- > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
parallel rank (1, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800 - > 
number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800 > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800 - - > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000 -[2021-10-18 04:45:57,517] [INFO] [utils.py:806:see_memory_usage] After Building Model -[2021-10-18 04:45:57,518] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB -[2021-10-18 04:45:57,518] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.72 GB, percent = 21.2% - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800 -setting training iterations to 292968 -> learning rate decay style: cosine -DeepSpeed is enabled. 
-[2021-10-18 04:45:57,519] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+cd7967d, git-hash=cd7967d, git-branch=master
-[2021-10-18 04:45:57,556] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-18 04:45:57,556] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-18 04:45:57,556] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-18 04:45:57,557] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-18 04:45:57,557] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-18 04:45:57,557] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-18 04:45:57,557] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-18 04:45:57,557] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-18 04:45:57,557] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-18 04:45:57,557] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
-Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 64 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 46 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 97 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-[2021-10-18 04:45:59,398] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-18 04:45:59,399] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-18 04:45:59,399] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.74 GB, percent = 21.2%
-[2021-10-18 04:45:59,444] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-18 04:45:59,445] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-18 04:45:59,445] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.74 GB, percent = 21.2%
-[2021-10-18 04:45:59,445] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-18 04:45:59,473] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-18 04:45:59,474] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-18 04:45:59,474] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.74 GB, percent = 21.2%
-[2021-10-18 04:45:59,474] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-18 04:45:59,474] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-18 04:45:59,474] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-18 04:45:59,475] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-18 04:45:59,475] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] activation_checkpointing_config {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-18 04:45:59,475] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-18 04:45:59,476] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-18 04:45:59,477] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-18 04:45:59,477] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
-[2021-10-18 04:45:59,477] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-18 04:45:59,864] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,865] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-18 04:45:59,960] [WARNING] [engine.py:2020:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-[2021-10-18 04:45:59,960] [WARNING] [engine.py:2020:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- will not load any checkpoints and will start from random
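The load_checkpoint warning above (emitted once per rank) boils down to one condition: when no explicit tag is passed, DeepSpeed resolves the checkpoint tag by reading a plain-text file named latest inside the checkpoint directory, and on a fresh run that file does not exist yet, so the engine falls back to random initialization. A minimal sketch of that lookup, assuming the standard DeepSpeed checkpoint layout; the function name is hypothetical, not the engine's actual code path:

    import os

    def resolve_latest_tag(load_dir, latest_filename="latest"):
        # DeepSpeed-style tag lookup: the 'latest' file, when present, holds
        # the tag of the most recent checkpoint, e.g. "global_step1000".
        latest_path = os.path.join(load_dir, latest_filename)
        if not os.path.isfile(latest_path):
            return None  # no tag -> warn and start from random weights
        with open(latest_path) as f:
            return f.read().strip()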
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
- warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-time (ms) | load-checkpoint: 0.56
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
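The spread of per-rank estimates above (103.3650944 vs 125.2213504 billion) is consistent with the UserWarning from megatron/utils.py: a naive sum of p.numel() over a pipeline-parallel model counts the tied embeddings once per stage that holds them. A hedged sketch of such a naive count; this is a hypothetical helper, not the exact utils.py code:

    def estimated_params_in_billions(model_chunks):
        # Naive count over all local model chunks. With pipeline parallelism
        # > 1 this over-counts, because the first and last stages each hold a
        # copy of the tied embedding weights.
        n = sum(p.numel() for chunk in model_chunks for p in chunk.parameters())
        return n / 1e9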
-estimated model parameters: 125.22432
without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as 
the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432estimated model parameters: 125.22432 - -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be 
inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.2213504 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several 
copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and 
last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: 
Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.368064 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.368064estimated model parameters without embeddings: 103.368064 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model 
parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-18 04:45:59 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 3000320
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.038674 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.093 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.057 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.003 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-18 04:46:04
-done with setup ...
-training ...
-Number of parameters: 125.22432 billion
-Number of parameters: 125.2213504 billion
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-Number of parameters without embeddings: 103.3650944 billion
-time (ms) | model-and-optimizer-setup: 4874.10 | train/valid/test-data-iterators-setup: 4181.58
-[before the start of training step] datetime: 2021-10-18 04:46:04
-[2021-10-18 04:46:04,758] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-18 04:46:04,759] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-18 04:46:04,759] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-18 04:46:04,759] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-18 04:46:04,759] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
-srun: Job step aborted: Waiting up to 62 seconds for job step to finish.
-srun: Job step aborted: Waiting up to 62 seconds for job step to finish.
-Killing subprocess 756918
-Killing subprocess 756919
-Killing subprocess 2064332
-Killing subprocess 756920
-Killing subprocess 756921
-Killing subprocess 2064333
-Killing subprocess 2636667
-Killing subprocess 2865212
-Killing subprocess 2064334
-Killing subprocess 3180688
-Killing subprocess 2636668
-Killing subprocess 2860545
-Killing subprocess 2064335
-Killing subprocess 2865213
-Killing subprocess 3180689
-Killing subprocess 2636669
-Killing subprocess 3069671
-Killing subprocess 2636671
-Killing subprocess 2860546
-Killing subprocess 1543842
-Killing subprocess 3069672
-Killing subprocess 2860547
-Killing subprocess 2865214
-Killing subprocess 1543370
-Killing subprocess 2865216
-Killing subprocess 3180690
-Killing subprocess 3180691
-Killing subprocess 3069673
-Killing subprocess 1556692
-Killing subprocess 2860548
-Killing subprocess 3069674
-Killing subprocess 1543843
-Killing subprocess 1543844
-Killing subprocess 1543371
-Killing subprocess 1543846
-Killing subprocess 1550946
-Killing subprocess 1540564
-Killing subprocess 1543372
-Killing subprocess 1556693
-Killing subprocess 1550947
-Killing subprocess 1540565
-Killing subprocess 4003058
-Killing subprocess 1546437
-Killing subprocess 1556694
-Killing subprocess 1556695
-Killing subprocess 1550948
-Killing subprocess 1540566
-Killing subprocess 1550949
-Killing subprocess 1546438
-Killing subprocess 1543374
-Killing subprocess 4003059
-Killing subprocess 1546439
-Killing subprocess 1543778
-Killing subprocess 4003060
-Killing subprocess 20328
-Killing subprocess 1540567
-Killing subprocess 1543779
-Killing subprocess 4003062
-Killing subprocess 1546440
-Killing subprocess 20329
-Killing subprocess 1544439
-Killing subprocess 20330
-Killing subprocess 1543780
-Killing subprocess 1650226
-Killing subprocess 20331
-Killing subprocess 1544440
-Killing subprocess 1817676
-Killing subprocess 1650227
-Killing subprocess 1544441
-Killing subprocess 394276
-Killing subprocess 1650228
-Killing subprocess 377681
-Killing subprocess 1817677
-Killing subprocess 1544442
-Killing subprocess 1288766
-Killing subprocess 394277
-Killing subprocess 1650229
-Killing subprocess 1817678
-Killing subprocess 568044
-Killing subprocess 394278
-Killing subprocess 356775
-Killing subprocess 1543781
-Killing subprocess 2203462
-Killing subprocess 1935002
-Killing subprocess 1817680
-Killing subprocess 4109197
-Killing subprocess 627666
-Killing subprocess 377682
-Killing subprocess 396853
-Killing subprocess 482551
-Killing subprocess 1288767
-Killing subprocess 3396833
-Killing subprocess 568045
-Killing subprocess 356776
-Killing subprocess 4109198
-Killing subprocess 627667
-Killing subprocess 394279
-Killing subprocess 1935003
-Killing subprocess 356777
-slurmstepd: error: *** STEP 1587017.0 ON r6i4n4 CANCELLED AT 2021-10-18T04:51:36 ***
-Killing subprocess 377683
-Killing subprocess 2203463
-Killing subprocess 482552
-Killing subprocess 396854
-Killing subprocess 3396834
-Killing subprocess 4109199
-Killing subprocess 627668
-Killing subprocess 377684
-Killing subprocess 482553
-Killing subprocess 3396835
-Killing subprocess 1935004
-Killing subprocess 1288768
-Killing subprocess 396855
-Killing subprocess 1288770
-Killing subprocess 627670
-Killing subprocess 4109201
-Killing subprocess 2203464
-Killing subprocess 2203465
-Killing subprocess 3396836
-Killing subprocess 482554
-Killing subprocess 1935005
-Killing subprocess 568046
-Killing subprocess 568047
-Killing subprocess 396856
-Killing subprocess 356779
-Killing subprocess 717811
-Killing subprocess 717812
-Killing subprocess 717813
-Killing subprocess 717814
-Main process received SIGTERM, exiting
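The "Killing subprocess <pid>" and "Main process received SIGTERM, exiting" messages are each node's launcher tearing down its local worker processes after the step cancellation above. A minimal sketch of that teardown pattern, assuming a hypothetical per-node launcher (this is not DeepSpeed's actual launcher code):

    import signal
    import subprocess
    import sys

    # Hypothetical launcher: one long-running worker per local GPU.
    workers = [subprocess.Popen([sys.executable, "-c", "import time; time.sleep(3600)"])
               for _ in range(4)]

    def terminate_workers(signum, frame):
        # Forward SLURM's SIGTERM to every child before exiting ourselves.
        for proc in workers:
            print(f"Killing subprocess {proc.pid}")
            proc.terminate()
        print("Main process received SIGTERM, exiting")
        sys.exit(0)

    signal.signal(signal.SIGTERM, terminate_workers)
    for proc in workers:
        proc.wait()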
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
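This banner is printed once per spawned worker: the launcher defaults OMP_NUM_THREADS to 1 so that the many ranks sharing a node do not each start a full complement of OpenMP threads. A sketch of pinning the thread count explicitly instead of relying on the default; the value 1 is the conservative choice, and should be raised to match your cores-per-rank budget:

    import os

    # Must be set before torch/numpy initialize their thread pools.
    os.environ.setdefault("OMP_NUM_THREADS", "1")

    import torch
    torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))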
-***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ------------------------------------------------------------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. [OKAY] [OKAY][OKAY] -[OKAY] - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op name -op name op name................op name ................installed................ ................ installed ..installed installed ..compatible - ....--------------------------------------------------compatible -compatiblecompatible - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - -cpu_adam ............... [NO] ....... [OKAY]cpu_adamcpu_adam - .............................. cpu_adam[NO][NO] ............................. [NO] fused_adam [OKAY][OKAY] -....... -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY][OKAY] -[OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op nameop name - op name ................ op name................................ installed................installed installed .. installedcompatible.... - compatible--------------------------------------------------compatible.. - - - --------------------------------------------------compatible-------------------------------------------------- - - --------------------------------------------------- -cpu_adam ...............cpu_adam [NO]cpu_adam ...............cpu_adam....... ............... [NO]...............[NO][OKAY] -.......[NO]....... [OKAY] ....... -[OKAY] fused_adam -[OKAY] -............. [NO] ....... [OKAY] -fused_adamfused_adamfused_lamb .......................................fused_adam [NO] [NO] [NO]............. ....... .............. [OKAY][NO][OKAY] - -[OKAY]....... - fused_lamb[OKAY] -.............fused_lamb [NO]............. fused_lamb....... [NO].............sparse_attn[OKAY] - ...................[NO] [OKAY][NO]....... - .......[OKAY] -[OKAY] -sparse_attn ............transformer [NO]............ sparse_attn[NO]....... ...................[OKAY] -[NO][OKAY]sparse_attn -transformer....... ............[OKAY] stochastic_transformer............ - [NO][NO].transformer .......[NO]............ .......[NO].......[OKAY] -.......[OKAY][OKAY] - -[OKAY]stochastic_transformer - .transformer stochastic_transformer [NO] ............ .......[NO]. [OKAY][NO] - ....... [OKAY] - ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - .............[OKAY] -[NO] ....... [OKAY] -fused_adamfused_lambfused_adamfused_adam .................................................... [NO][NO][NO] .......[NO] .............. .......[OKAY][OKAY] [OKAY][OKAY] - - - -fused_lambfused_lamb ............. fused_lamb ............. [NO] ............. [NO].......sparse_attn [OKAY][NO] - .......................... [NO][OKAY] [OKAY] - -....... [OKAY] -sparse_attn ............ [NO]transformer ................... [NO][OKAY] -sparse_attn.......sparse_attn transformer .................................... [OKAY] [NO] -[NO] [NO] ....... 
....... stochastic_transformer .......[OKAY][OKAY]. - - [OKAY][NO] - transformer.......stochastic_transformer ............[OKAY]transformer. - ............[NO] [NO] [NO] ..................... [OKAY] [OKAY] -[OKAY] - -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... [OKAY][OKAY].................. - - [OKAY]----------------------------------------------------------------------------------------------------[OKAY] - - - ---------------------------------------------------op nameop name --------------------------------------------------................ - - op nameinstalledop name................ .................. ................ installed compatibleinstalled installed - ..--------------------------------------------------.... - compatiblecompatiblecompatible - - ------------------------------------------------------------------------------------------------------------------------------------------------------- -cpu_adam - - ............... [NO] ....... [OKAY] -cpu_adam cpu_adamcpu_adam............... ..............................[NO] [NO][NO]....... fused_adam ....... ....... [OKAY] [OKAY] -[OKAY]............. - - [NO] ....... [OKAY] -fused_lamb fused_adam............. fused_adam[NO]fused_adam............. .......................... .......[NO] [NO] [NO] ....... [OKAY]..............[OKAY] - -[OKAY][OKAY] - -fused_lambfused_lambfused_lamb ....................................... [NO]sparse_attn[NO][NO] ................................. [NO] [OKAY][OKAY][OKAY] - - -....... [OKAY] -transformer ............ [NO] ....... [OKAY] -sparse_attnsparse_attn sparse_attn ............ ............ stochastic_transformer............ [NO] [NO][NO] ...................... [OKAY] -[NO][OKAY][OKAY] - -transformer....... transformer............transformer[OKAY] -............[NO] ............ 
[NO]....... [NO].......[OKAY] .......[OKAY] - -[OKAY] -stochastic_transformer stochastic_transformerstochastic_transformer. .[NO]. [NO].......[NO] .......[OKAY]....... - [OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -DeepSpeed C++/CUDA extension op report ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------JIT compiled ops requires ninja - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninja ninja .................................... ....................................[OKAY][OKAY] -[OKAY] -[OKAY]-------------------------------------------------- --------------------------------------------------- - - ---------------------------------------------------op name--------------------------------------------------op name - - ................................ op nameinstalledop name installed.................................. compatible ..installed -installed -------------------------------------------------- compatible.... - - --------------------------------------------------compatiblecompatible - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... cpu_adam[NO] ......................cpu_adam cpu_adam [OKAY]............... - [NO]............... [NO].......[NO] .......[OKAY]....... - [OKAY][OKAY] -fused_adam - ............. [NO] ....... [OKAY] -fused_adam ............. fused_adam[NO] fused_adam fused_lamb.................... ..........................[OKAY] [NO] -[NO][NO] fused_lamb....... .............. ............. [OKAY][OKAY][OKAY][NO] - - -....... fused_lamb[OKAY] -fused_lamb .......................... [NO][NO] .............. [OKAY]sparse_attn[OKAY] - -............sparse_attn [NO]............ .......[NO] [OKAY]....... - [OKAY] -transformer transformer............ ............sparse_attn[NO] sparse_attn [NO] ...................................... [OKAY][NO][OKAY][NO] - - .............. [OKAY][OKAY] -stochastic_transformerstochastic_transformer - transformer ..transformer............ [NO]............[NO][NO] ..............[NO]....... 
[OKAY].......[OKAY][OKAY] - - -[OKAY] -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................. .................................... [OKAY] [OKAY] -[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- -op name -op name op nameop name................ ................................installed................ installedinstalled..installed .... compatible compatiblecompatible -.. --------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------compatible - --------------------------------------------------- -cpu_adamcpu_adam cpu_adam..............................cpu_adam [NO] ............... ...............[NO] ..............[NO] [NO] [OKAY][OKAY] ....... - -....... [OKAY][OKAY] - -fused_adamfused_adam .......................... fused_adamfused_adam[NO] [NO] .......................... ....... ....... [NO][OKAY][NO][OKAY] - - .............. fused_lambfused_lamb[OKAY] [OKAY] -.......................... - [NO][NO]fused_lamb ..............fused_lamb [OKAY] ............. -[OKAY]............. - [NO][NO] .............. [OKAY][OKAY] - -sparse_attn sparse_attn............ ............[NO] [NO]....... sparse_attnsparse_attn....... [OKAY] ........................ - [OKAY] [NO] -[NO]transformer .......transformer................... ............[OKAY][NO] [OKAY] - [NO] -....... transformer....... transformer[OKAY] ............ - [OKAY]............[NO] - [NO].......stochastic_transformer [OKAY]stochastic_transformer....... - . .[NO][OKAY] stochastic_transformer[NO] - ............... [OKAY]stochastic_transformer - [OKAY] - [NO] ........ 
[OKAY][NO] - ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -DeepSpeed C++/CUDA extension op report------------------------------------------------------------------------------------------------------------------------------------------------------ - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaJIT compiled ops requires ninja - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................................... .................. [OKAY][OKAY] - [OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- - -op name--------------------------------------------------op name -op name................................ op name ................ installedinstalled................ installed.. ..installed ..compatible compatible -..--------------------------------------------------compatible - --------------------------------------------------- - -compatible-------------------------------------------------- - --------------------------------------------------- -cpu_adam cpu_adam............... cpu_adam ...............cpu_adam [NO] [NO]............... ...................... ....... [NO] [NO] [OKAY][OKAY] ....... - -....... [OKAY][OKAY] - -fused_adamfused_adam .............fused_adamfused_adam............. .............[NO].............[NO] [NO][NO].............. .......[OKAY].......[OKAY] - -[OKAY][OKAY] - -fused_lambfused_lamb fused_lamb..........................fused_lamb [NO][NO]............. ............. ....... ....... [NO][NO] [OKAY] [OKAY] - ....... -....... [OKAY][OKAY] - -sparse_attnsparse_attn ........................sparse_attnsparse_attn [NO]............[NO]............ [NO].............. [NO] [OKAY].......[OKAY] - -.......[OKAY] -[OKAY]transformertransformer -transformer ........................ transformer ............[NO] [NO] ............[NO] ....... .......[NO]....... [OKAY]....... [OKAY] - [OKAY] -[OKAY] - -stochastic_transformerstochastic_transformerstochastic_transformer stochastic_transformer ... . [NO][NO][NO] [NO]..................... 
[OKAY] .......[OKAY] - -[OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -DeepSpeed C++/CUDA extension op report - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................................... ..................[OKAY][OKAY] [OKAY] - -[OKAY] - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop nameop name op name ................ ................................ ................ installed installedinstalledinstalled .. .. .... compatible -compatiblecompatiblecompatible --------------------------------------------------- - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -cpu_adam ...............cpu_adam cpu_adam[NO] cpu_adam.............................. .......[NO]...............[NO] [OKAY].......[NO]....... - .......[OKAY][OKAY] - -[OKAY] -fused_adam ............. [NO] .......fused_adamfused_adam [OKAY].............fused_adam............. - [NO].............[NO] fused_lamb ....... [NO] .................... [OKAY] ....... -[NO][OKAY] -[OKAY]....... -fused_lamb [OKAY]fused_lamb............. -fused_lamb ............. [NO] ............. [NO] ....... [NO] ..............[OKAY] -[OKAY][OKAY]sparse_attn - - ............ [NO] ....... [OKAY] -transformer ............ [NO] .......sparse_attn sparse_attn [OKAY] sparse_attn........................ -............[NO][NO] [NO]....... .......stochastic_transformer ....... [OKAY] [OKAY] -.[OKAY] - -[NO]transformertransformer transformer ................... ........................[NO][OKAY] - [NO][NO]....... [OKAY].............. - [OKAY][OKAY] - -stochastic_transformer stochastic_transformer.stochastic_transformer [NO]. . ....... [NO][NO][OKAY] - .............. 
[OKAY][OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -JIT compiled ops requires ninja-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... [OKAY]..................[OKAY][OKAY] - - -[OKAY]---------------------------------------------------------------------------------------------------- --------------------------------------------------- -op name - - --------------------------------------------------................op nameop name - installed op name................ ................ installed.. installed................compatible.. -..installedcompatible --------------------------------------------------compatible - -.. ----------------------------------------------------------------------------------------------------- - -compatible --------------------------------------------------- -cpu_adam ............... cpu_adam[NO]cpu_adam .......cpu_adam.............................. [OKAY]...............[NO] - [NO]....... .......[NO] [OKAY][OKAY]....... - - fused_adam[OKAY] -............. [NO] ....... [OKAY]fused_adam - ............. [NO] fused_lamb....... fused_adam[OKAY].............fused_adam - .............[NO] .................... fused_lamb [NO][OKAY][NO] -............. ....... ....... [NO] [OKAY] [OKAY] -....... - [OKAY] -fused_lambfused_lamb ..........................sparse_attn [NO][NO]............ .......[NO]....... .......sparse_attn[OKAY] [OKAY] -............[OKAY] - - [NO] .......transformer [OKAY]............ - [NO]transformer ................... sparse_attn[OKAY][NO] sparse_attn - ............ ....... ............[NO][OKAY]stochastic_transformer -[NO] ..............stochastic_transformer . [OKAY] [OKAY][NO]. - -....... transformer [NO]transformer[OKAY] -............ ....... ............ [NO] [OKAY] -[NO]....... .......[OKAY] -[OKAY] -stochastic_transformerstochastic_transformer .. [NO] [NO]....... 
.......[OKAY] -[OKAY] ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja-------------------------------------------------- - - -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
--------------------------------------------------- -DeepSpeed C++/CUDA extension op report ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report --------------------------------------------------- - - -------------------------------------------------------------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja-------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -JIT compiled ops requires ninja -JIT compiled ops requires ninja --------------------------------------------------- - -JIT compiled ops requires ninja ------------------------------------------------------------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaJIT compiled ops requires ninja --------------------------------------------------- - --------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja ...................................................... .................. 
[OKAY] -[OKAY][OKAY][OKAY]-------------------------------------------------- - - - ----------------------------------------------------------------------------------------------------- ---------------------------------------------------op name - - op name................op name op name ................................installed ................ ..installedinstalledinstalled compatible .. -.. --------------------------------------------------compatible -.. -compatible -------------------------------------------------- -compatible - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... [NO]cpu_adam ....... cpu_adam...............[OKAY]cpu_adam -............... [NO]...............[NO] .......[NO]....... [OKAY].......fused_adam[OKAY] - -[OKAY]............. - [NO] ....... [OKAY] -fused_adamfused_lamb fused_adam..........................fused_adam .............[NO][NO]............. .......[NO][NO] ....... ....... [OKAY].......[OKAY][OKAY] - - -[OKAY] -fused_lambfused_lambfused_lamb ....................................... [NO][NO][NO]sparse_attn ................................. [OKAY][NO] [OKAY].......[OKAY] - - -[OKAY] -transformer ............ [NO] ....... sparse_attnsparse_attn[OKAY] sparse_attn............ - ........................[NO] [NO]stochastic_transformer [NO] ....... ....... ....... . [OKAY][OKAY][OKAY][NO] - - -....... transformer[OKAY]transformertransformer - .................................... [NO][NO][NO] ..................... [OKAY][OKAY][OKAY] - - -stochastic_transformerstochastic_transformer stochastic_transformer .. . [NO] [NO] [NO] .............. [OKAY].......[OKAY] - -[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................................... .................. 
[OKAY] [OKAY] [OKAY] -[OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - ---------------------------------------------------op nameop nameop name - ................................................ op name installedinstalled installed .................... compatible..compatibleinstalled - --------------------------------------------------- compatible-------------------------------------------------- - - -..-------------------------------------------------- -compatible -ninjaninjaninjaninja .................. .................................... .................. [OKAY] [OKAY][OKAY] ---------------------------------------------------cpu_adam -[OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -cpu_adam cpu_adam.............................. [NO]............... [NO] ..............cpu_adam [NO] [OKAY] -[OKAY]...................... - [OKAY][NO] -op nameop nameop name op name ................................ ................ ................installedinstalledinstalled ..installed.. .. ..compatiblecompatiblecompatible - - ....... [OKAY] -fused_adam .............fused_adam [NO]............. fused_adam....... [NO].............[OKAY] - -compatible---------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -....... fused_adam[NO][OKAY] fused_lamb -....... .......................... [OKAY][NO]fused_lamb[NO] --------------------------------------------------- - ........................... fused_lamb[NO][OKAY] -[OKAY].................... -cpu_adamcpu_adamcpu_adam cpu_adam.............................. ............... ............... [NO] [NO][NO] [NO] .............. ..............[OKAY] [OKAY] -[OKAY] -[OKAY] - - [NO][OKAY] - ....... fused_lamb[OKAY] -fused_adam .............fused_adam fused_adamfused_adam [NO] ............. ............. .................... [NO] [NO][OKAY][NO]....... -............. sparse_attn[NO] ............ .......[NO] sparse_attn ....... sparse_attn ............[OKAY] [OKAY] - - ....... ....... [OKAY]fused_lamb -[OKAY] [OKAY] -............. - [NO] .......fused_lamb fused_lamb fused_lamb[OKAY] ............. -............[NO] transformer [NO] ....... ............ ....... [OKAY][NO] - [OKAY].......transformer -.......................... [NO] [NO] [NO] ....... ....... ....... [OKAY] [OKAY] -[OKAY] -sparse_attn - ............ [NO] ....... [OKAY] - [OKAY]sparse_attn -transformer ............ sparse_attn[NO]sparse_attn sparse_attn ............................... [OKAY][NO] -............ [NO] ....... [NO] ....... stochastic_transformer[OKAY] -............transformer ............[NO]stochastic_transformer............ [NO].[NO]....... .......[NO].......[OKAY] -.......[OKAY] [OKAY] -[OKAY] - -[OKAY]....... - transformer. [OKAY] ............ -stochastic_transformer stochastic_transformer .transformer. [NO][NO]............ ....... .......[OKAY][NO] - [OKAY] -....... [OKAY] -transformer[NO] transformer[NO]............ ..........................[NO] [OKAY][OKAY].......[NO] - - [OKAY]....... - [OKAY]stochastic_transformer -stochastic_transformer . [NO] ....... [OKAY] - stochastic_transformer stochastic_transformer. .[NO] .[NO] .......[NO]....... [OKAY][OKAY]....... 
- - [OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY] -[OKAY] - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop nameop name op name ................ ................................ ................installed installedinstalledinstalled.. ...... compatible compatiblecompatiblecompatible - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -cpu_adamcpu_adamcpu_adamcpu_adam ............... ............... ...............[NO] ...............[NO].......[NO] ....... [OKAY].......[NO] - [OKAY] ....... -[OKAY] -[OKAY] -fused_adam ............. fused_adam fused_adam[NO]fused_adam ............. .................... .............[NO] [NO][OKAY] [NO] -.............. [OKAY]fused_lamb.......[OKAY] - [OKAY] -............. - [NO]fused_lamb fused_lamb .......fused_lamb ............. ............. [OKAY]............. [NO] - [NO] [NO] ....... ....... ....... [OKAY][OKAY][OKAY] - - -sparse_attn ............ [NO] ....... [OKAY] -transformersparse_attnsparse_attnsparse_attn ............ ........................ ............ [NO][NO][NO] [NO] ..................... .......[OKAY] [OKAY] [OKAY] - -[OKAY] - -stochastic_transformertransformertransformer transformer ............ ............. ............ [NO][NO] [NO].......[NO] [OKAY].............. -....... [OKAY][OKAY][OKAY] -stochastic_transformer - - . stochastic_transformerstochastic_transformer [NO] ......... [OKAY][NO][NO] - .............. [OKAY][OKAY] - -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY][OKAY] -[OKAY] - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - ---------------------------------------------------op nameop name - op name................op name ................installed ................ ..................installed compatibleinstalledinstalled.. - .. --------------------------------------------------..compatible - -compatiblecompatible --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - -cpu_adam ............... [NO] .......cpu_adam cpu_adam [OKAY]............... cpu_adam -............... [NO]...............[NO] .......[NO]....... [OKAY].......[OKAY]fused_adam - - [OKAY]............. -[NO] ....... [OKAY] -fused_adam fused_lambfused_adam.............fused_adam .......................................[NO] .......[NO][NO] [NO] ....... [OKAY] -..............[OKAY] - [OKAY][OKAY] -fused_lamb - ............. [NO]fused_lambfused_lamb ................................. [OKAY]sparse_attn[NO][NO] - .......................... [OKAY][NO][OKAY] - -....... [OKAY] -sparse_attntransformer ........................ [NO][NO] ..............sparse_attn sparse_attn [OKAY][OKAY] - - ........................ transformer[NO]stochastic_transformer[NO] ................... ........[NO][OKAY] -[NO].......[OKAY] transformer....... -[OKAY]............[OKAY] - -transformer[NO] stochastic_transformer................... [OKAY][NO]. - .......[NO] .......stochastic_transformer [OKAY] [OKAY] - -. 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
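The report's NOTE describes the JIT path: an op marked [NO]/[OKAY] is compiled with ninja the first time it is loaded. A hedged sketch of what that first load looks like, with cpu_adam as the example and a working C++/CUDA toolchain assumed:

    # Sketch only: .load() JIT-compiles the extension via ninja on first use
    # and caches the result (by default under ~/.cache/torch_extensions).
    from deepspeed.ops.op_builder import CPUAdamBuilder

    cpu_adam = CPUAdamBuilder().load()  # ninja build happens here if needed
    print(cpu_adam)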
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
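This warning means the libaio development files were not visible to the build. A quick, standard-library-only way to check whether the shared object is present on the system at all:

    # Check for libaio; find_library returns None when no libaio.so is found.
    import ctypes.util

    libaio = ctypes.util.find_library("aio")
    print("libaio:", libaio or "not found")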
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
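For a libaio built from source, the warning above suggests pointing the build at it through CFLAGS/LDFLAGS. A hedged sketch using the made-up prefix /opt/libaio, set before DeepSpeed attempts the async_io JIT build:

    # /opt/libaio is a hypothetical install prefix; adjust to the real one.
    import os

    os.environ["CFLAGS"] = "-I/opt/libaio/include"
    os.environ["LDFLAGS"] = "-L/opt/libaio/lib"
    # import deepspeed and trigger the async_io build only after these are set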
Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - -JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninjaninjaninjaninja .................................... ....................................[OKAY][OKAY] - -[OKAY][OKAY]-------------------------------------------------- - ----------------------------------------------------------------------------------------------------- - - ---------------------------------------------------op nameop name -op name op name ................ ................ ................................ installed installed installed.. ..installed compatible..compatible - - ..---------------------------------------------------------------------------------------------------- - -compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... [NO]cpu_adam ...............cpu_adam cpu_adam....... [NO] ......................[OKAY]............... - [NO][OKAY][NO] - .............. [OKAY][OKAY] - -fused_adam .............fused_adam .............fused_adamfused_adam[NO] [NO] ............. ............. ....... [OKAY][NO]....... -[NO] .............. fused_lamb[OKAY] -[OKAY] [OKAY] -.............fused_lamb - .............[NO] fused_lamb.......fused_lamb[NO] ..........................[OKAY] ....... -[NO] [NO]....... [OKAY].......[OKAY] - - [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -sparse_attntransformersparse_attn sparse_attn ........................ [NO]............ ...................[NO] [NO] [OKAY][NO]....... ....... - [OKAY].......[OKAY] - -[OKAY]stochastic_transformer - transformer transformer............transformer. ............[NO]............[NO] [NO].......[NO] ..............[OKAY] - ....... 
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
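The environment block above was printed identically by every rank; the same facts can be queried directly from Python. A minimal sketch (the values in the comments are those reported in this log, not guarantees for other environments):

# Sketch only: queries the same version facts the report prints.
import torch
import deepspeed

print("torch version :", torch.__version__)       # 1.8.1 in this log
print("torch cuda    :", torch.version.cuda)      # 11.1 in this log
print("deepspeed     :", deepspeed.__version__)   # 0.5.5+57dee5a in this log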
-**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ****
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference utils.. ..................[NO] [NO]....... .......[OKAY] -[OKAY] -quantizerutils ................................ [NO][NO] .............. [OKAY][OKAY] - ---------------------------------------------------quantizer - .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY]
---------------------------------------------------
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ****
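The warning above points at the standard fix: build libaio from source, then tell the compiler and linker where it landed before (re)installing DeepSpeed. A minimal sketch, assuming a from-source libaio under the hypothetical prefix $HOME/local (not a path taken from this log), and using DeepSpeed's DS_BUILD_AIO switch to pre-compile the async_io op at install time:

    # assumption: libaio headers and .so were installed under $HOME/local
    export CFLAGS="-I$HOME/local/include"
    export LDFLAGS="-L$HOME/local/lib"
    # DS_BUILD_AIO=1 asks DeepSpeed to build the async_io op during install
    DS_BUILD_AIO=1 pip install --no-cache-dir deepspeed

With async_io left unbuilt, as here, DeepSpeed just reports the op as [NO] and the run continues; the op only matters for NVMe offloading.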
11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -DeepSpeed general environment info: -torch install path ............... torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -.................... torch version1.8.1 -.................... torch cuda version1.8.1 -............... 11.1torch cuda version - ...............nvcc version .....................11.1 -11.2nvcc version - deepspeed install path..................... ...........11.2 -deepspeed install path ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']........... - deepspeed info ................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']0.5.5+57dee5a, 57dee5a, pp_deadlock_fix - -deepspeed infodeepspeed wheel compiled w. ......................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fixtorch 1.8, cuda 11.1 - -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path -............... torch install path ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']............... - torch version .................... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']1.8.1 - -torch cuda versiontorch version ................................... 11.11.8.1 - -nvcc version .....................torch cuda version 11.2............... - deepspeed install path11.1 -...........nvcc version ..................... 
11.2['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed install pathdeepspeed info .............................. 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed wheel compiled w. - deepspeed info...... ...................torch 1.8, cuda 11.1 -0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path -............... torch install path['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -............... torch version .................... 1.8.1['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch cuda version torch version............... ....................11.1 -1.8.1 -nvcc version .....................torch cuda version 11.2............... - deepspeed install path11.1 -...........nvcc version ..................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']11.2 - -deepspeed install pathdeepspeed info .............................. 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed wheel compiled w. - ......deepspeed info torch 1.8, cuda 11.1................... - 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ******** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** - -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main ******** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** - -**** Git info for Megatron: git_hash=df7a9d9 git_branch=main **** -**** Git info for Megatron: 
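The report above repeats identically for every rank of the job; the version triplet it records can be sanity-checked in-process. A minimal sketch, assuming only that the torch and deepspeed packages listed above are importable:

import torch
import deepspeed

# Expected values, per the report above: torch 1.8.1 built against CUDA 11.1,
# DeepSpeed 0.5.5+57dee5a (nvcc 11.2 is a system tool, not visible from Python).
print("torch version ....", torch.__version__)      # 1.8.1
print("torch cuda .......", torch.version.cuda)     # 11.1
print("deepspeed ........", deepspeed.__version__)  # 0.5.5+...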
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
-------------------------- arguments ------------------------
-  accumulate_allreduce_grads_in_fp32 .............. False
-  adam_beta1 ...................................... 0.9
-  adam_beta2 ...................................... 0.95
-  adam_eps ........................................ 1e-08
-  adlr_autoresume ................................. False
-  adlr_autoresume_interval ........................ 1000
-  apply_query_key_layer_scaling ................... True
-  apply_residual_connection_post_layernorm ........ False
-  attention_dropout ............................... 0.1
-  attention_softmax_in_fp32 ....................... False
-  bert_binary_head ................................ True
-  bert_load ....................................... None
-  bf16 ............................................ False
-  bias_dropout_fusion ............................. True
-  bias_gelu_fusion ................................ True
-  biencoder_projection_dim ........................ 0
-  biencoder_shared_query_context_model ............ False
-  block_data_path ................................. None
-  checkpoint_activations .......................... True
-  checkpoint_in_cpu ............................... False
-  checkpoint_num_layers ........................... 1
-  clip_grad ....................................... 1.0
-  codecarbon_dir .................................. None
-  consumed_train_samples .......................... 0
-  consumed_train_tokens ........................... 0
-  consumed_valid_samples .......................... 0
-  contigious_checkpointing ........................ False
-  cpu_optimizer ................................... False
-  cpu_torch_adam .................................. False
-  curriculum_learning ............................. False
-  data_impl ....................................... mmap
-  data_parallel_size .............................. 1
-  data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
-  dataloader_type ................................. single
-  DDP_impl ........................................ local
-  decoder_seq_length .............................. None
-  deepscale ....................................... False
-  deepscale_config ................................ None
-  deepspeed ....................................... True
-  deepspeed_activation_checkpointing .............. True
-  deepspeed_config ................................ ./ds_config.1645299.json
-  deepspeed_mpi ................................... False
-  distribute_checkpointed_activations ............. False
-  distributed_backend ............................. nccl
-  embedding_path .................................. None
-  encoder_seq_length .............................. 2048
-  eod_mask_loss ................................... False
-  eval_interval ................................... 1000
-  eval_iters ...................................... 5
-  evidence_data_path .............................. None
-  exit_duration_in_mins ........................... 55
-  exit_interval ................................... None
-  ffn_hidden_size ................................. 46400
-  finetune ........................................ False
-  fp16 ............................................ True
-  fp16_lm_cross_entropy ........................... False
-  fp32_residual_connection ........................ False
-  gigaflos_no_embeds .............................. 0
-  global_batch_size ............................... 2048
-  glu_activation .................................. None
-  hidden_dropout .................................. 0.1
-  hidden_size ..................................... 11600
-  hysteresis ...................................... 2
-  ict_head_size ................................... None
-  ict_load ........................................ None
-  img_dim ......................................... 224
-  indexer_batch_size .............................. 128
-  indexer_log_interval ............................ 1000
-  init_method_std ................................. 0.02
-  init_method_xavier_uniform ...................... False
-  initial_loss_scale .............................. 4294967296
-  kv_channels ..................................... 145
-  layernorm_epsilon ............................... 1e-05
-  lazy_mpu_init ................................... None
-  load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-  local_rank ...................................... 0
-  log_batch_size_to_tensorboard ................... True
-  log_interval .................................... 1
-  log_learning_rate_to_tensorboard ................ True
-  log_loss_scale_to_tensorboard ................... True
-  log_num_zeros_in_grad ........................... False
-  log_params_norm ................................. False
-  log_timers_to_tensorboard ....................... True
-  log_validation_ppl_to_tensorboard ............... True
-  loss_on_targets_only ............................ False
-  loss_scale ...................................... 12.0
-  loss_scale_window ............................... 1000
-  lr .............................................. 6e-05
-  lr_decay_iters .................................. None
-  lr_decay_samples ................................ None
-  lr_decay_style .................................. cosine
-  lr_decay_tokens ................................. 260000000000
-  lr_warmup_fraction .............................. None
-  lr_warmup_iters ................................. 0
-  lr_warmup_samples ............................... 216320
-  make_vocab_size_divisible_by .................... 128
-  mask_prob ....................................... 0.15
-  masked_softmax_fusion ........................... False
-  max_position_embeddings ......................... 2048
-  memory_centric_tiled_linear ..................... False
-  merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
-  micro_batch_size ................................ 1
-  min_loss_scale .................................. 1.0
-  min_lr .......................................... 6e-06
-  mmap_warmup ..................................... False
-  no_load_optim ................................... None
-  no_load_rng ..................................... None
-  no_save_optim ................................... None
-  no_save_rng ..................................... None
-  num_attention_heads ............................. 80
-  num_channels .................................... 3
-  num_classes ..................................... 1000
-  num_layers ...................................... 64
-  num_layers_per_virtual_pipeline_stage ........... None
-  num_workers ..................................... 2
-  onnx_safe ....................................... None
-  openai_gelu ..................................... False
-  optimizer ....................................... adam
-  override_lr_scheduler ........................... False
-  params_dtype .................................... torch.float16
-  partition_activations ........................... False
-  patch_dim ....................................... 16
-  pipeline_model_parallel_size .................... 32
-  position_embedding_type ......................... PositionEmbeddingType.absolute
-  profile_backward ................................ False
-  query_in_block_prob ............................. 0.1
-  rampup_batch_size ............................... None
-  rank ............................................ 0
-  remote_device ................................... none
-  reset_attention_mask ............................ False
-  reset_position_ids .............................. False
-  retriever_report_topk_accuracies ................ []
-  retriever_score_scaling ......................... False
-  retriever_seq_length ............................ 256
-  sample_rate ..................................... 1.0
-  save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-  save_interval ................................... 300
-  scatter_gather_tensors_in_pipeline .............. True
-  scattered_embeddings ............................ False
-  seed ............................................ 43
-  seq_length ...................................... 2048
-  sgd_momentum .................................... 0.9
-  short_seq_prob .................................. 0.1
-  split ........................................... 949,50,1
-  split_transformers .............................. False
-  synchronize_each_layer .......................... False
-  tensor_model_parallel_size ...................... 4
-  tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
-  tensorboard_log_interval ........................ 1
-  tensorboard_queue_size .......................... 5
-  tile_factor ..................................... 1
-  titles_data_path ................................ None
-  tokenizer_name_or_path .......................... None
-  tokenizer_type .................................. GPT2BPETokenizer
-  train_iters ..................................... None
-  train_samples ................................... 600000000
-  train_tokens .................................... 300000000000
-  use_checkpoint_lr_scheduler ..................... False
-  use_contiguous_buffers_in_ddp ................... False
-  use_cpu_initialization .......................... None
-  use_one_sent_docs ............................... False
-  use_pin_memory .................................. False
-  virtual_pipeline_model_parallel_size ............ None
-  vocab_extra_ids ................................. 0
-  vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
-  weight_decay .................................... 0.1
-  world_size ...................................... 128
-  zero_allgather_bucket_size ...................... 0.0
-  zero_contigious_gradients ....................... False
-  zero_reduce_bucket_size ......................... 0.0
-  zero_reduce_scatter ............................. False
-  zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
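Two later log lines follow directly from this dump: the constant micro-batch count of 2048 (global_batch_size 2048 divided by micro_batch_size 1 times data_parallel_size 1) and the roughly 104B total parameter count implied by hidden_size 11600, num_layers 64 and the padded vocabulary. A back-of-the-envelope sketch; the 12*L*h^2 term is the usual GPT approximation, not Megatron's exact accounting:

hidden, layers, vocab = 11600, 64, 50688   # hidden_size, num_layers, padded vocab size
transformer = 12 * layers * hidden ** 2    # attention + MLP weight matrices, all layers
embeddings = vocab * hidden
print(f"~{(transformer + embeddings) / 1e9:.0f}B parameters")  # ~104B

micro, dp, global_batch = 1, 1, 2048       # micro_batch_size, data_parallel_size, global_batch_size
print(global_batch // (micro * dp))        # 2048 micro-batches, matching the log line above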
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
-> padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-> setting tensorboard ...
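The 431 dummy tokens come from padding the 50257-entry GPT-2 vocabulary up to a multiple of make_vocab_size_divisible_by (128) times the tensor-parallel size (4), so each of the 4 tensor-parallel shards holds an equal slice of the embedding table. A minimal sketch of that rule:

orig_vocab, divisible_by, tp = 50257, 128, 4
multiple = divisible_by * tp                                   # 512
padded = ((orig_vocab + multiple - 1) // multiple) * multiple  # round up
print(padded, padded - orig_vocab)                             # 50688 431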
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-21 21:27:53,188] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/data'
->>> done with dataset index builder. Compilation time: 0.313 seconds
-WARNING: constraints for invoking optimized fused softmax kernel are not met. We default back to unfused kernel invocations.
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
->>> done with compiling and loading fused kernels. Compilation time: 5.317 seconds
-time to initialize megatron (seconds): 62.895
-[after megatron is initialized] datetime: 2021-10-21 21:27:58
-building GPT model ...
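The model-parallel seed of 2761 is consistent with the base seed 43 plus a fixed offset of 2718 plus the tensor-parallel rank; the offset value is inferred from the numbers above rather than quoted from checkpointing.py, so treat this sketch as a plausibility check only:

seed, tp_rank = 43, 0
model_parallel_seed = seed + 2718 + tp_rank  # 2761, as logged (2718 offset is an assumption)
data_parallel_seed = seed                    # 43, as logged
print(model_parallel_seed, data_parallel_seed)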
-[2021-10-21 21:27:58,952] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-21 21:27:58,953] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-21 21:27:58,953] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.11 GB, percent = 21.4%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
-[2021-10-21 21:28:00,626] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2:
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-    10: ParallelTransformerLayerPipe
-stage=4 layers=2
-    11: ParallelTransformerLayerPipe
-    12: ParallelTransformerLayerPipe
-stage=5 layers=2
-    13: ParallelTransformerLayerPipe
-    14: ParallelTransformerLayerPipe
-stage=6 layers=2
-    15: ParallelTransformerLayerPipe
-    16: ParallelTransformerLayerPipe
-stage=7 layers=2
-    17: ParallelTransformerLayerPipe
-    18: ParallelTransformerLayerPipe
-stage=8 layers=2
-    19: ParallelTransformerLayerPipe
-    20: ParallelTransformerLayerPipe
-stage=9 layers=2
-    21: ParallelTransformerLayerPipe
-    22: ParallelTransformerLayerPipe
-stage=10 layers=2
-    23: ParallelTransformerLayerPipe
-    24: ParallelTransformerLayerPipe
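The topology dump is completely regular: the model axis varies fastest, then data, then pipe, which for DP=1 and TP=4 collapses to rank = 4*pipe + model. A sketch that reproduces the mapping (the helper name global_rank is illustrative, not from the codebase):

TP, DP, PP = 4, 1, 32

def global_rank(pipe, data, model):
    # axes order pipe -> data -> model, model varying fastest
    return (pipe * DP + data) * TP + model

assert global_rank(1, 0, 1) == 5      # ProcessCoord(pipe=1, data=0, model=1): 5
assert global_rank(18, 0, 0) == 72    # ProcessCoord(pipe=18, data=0, model=0): 72
assert global_rank(31, 0, 3) == 127   # last of the 128 ranks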
ParallelTransformerLayerPipe -stage=11 layers=2 - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=12 layers=2 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe -stage=13 layers=2 - 29: ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=14 layers=2 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe -stage=15 layers=2 - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe -stage=16 layers=2 - 35: ParallelTransformerLayerPipe - 36: ParallelTransformerLayerPipe -stage=17 layers=2 - 37: ParallelTransformerLayerPipe - 38: ParallelTransformerLayerPipe -stage=18 layers=2 - 39: ParallelTransformerLayerPipe - 40: ParallelTransformerLayerPipe -stage=19 layers=2 - 41: ParallelTransformerLayerPipe - 42: ParallelTransformerLayerPipe -stage=20 layers=2 - 43: ParallelTransformerLayerPipe - 44: ParallelTransformerLayerPipe -stage=21 layers=2 - 45: ParallelTransformerLayerPipe - 46: ParallelTransformerLayerPipe -stage=22 layers=2 - 47: ParallelTransformerLayerPipe - 48: ParallelTransformerLayerPipe -stage=23 layers=2 - 49: ParallelTransformerLayerPipe - 50: ParallelTransformerLayerPipe -stage=24 layers=2 - 51: ParallelTransformerLayerPipe - 52: ParallelTransformerLayerPipe -stage=25 layers=2 - 53: ParallelTransformerLayerPipe - 54: ParallelTransformerLayerPipe -stage=26 layers=2 - 55: ParallelTransformerLayerPipe - 56: ParallelTransformerLayerPipe -stage=27 layers=2 - 57: ParallelTransformerLayerPipe - 58: ParallelTransformerLayerPipe -stage=28 layers=2 - 59: ParallelTransformerLayerPipe - 60: ParallelTransformerLayerPipe -stage=29 layers=2 - 61: ParallelTransformerLayerPipe - 62: ParallelTransformerLayerPipe -stage=30 layers=2 - 63: ParallelTransformerLayerPipe - 64: ParallelTransformerLayerPipe -stage=31 layers=6 - 65: ParallelTransformerLayerPipe - 66: ParallelTransformerLayerPipe - 67: - 68: MixedFusedLayerNorm - 69: EmbeddingPipe - 70: float16_to_fp32 - loss: CrossEntropy - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800 - > number of 
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
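The ProcessCoord table and the per-rank parameter counts pin down the run's topology: 32 pipeline stages x 4 tensor-parallel ranks x 1 data-parallel replica = 128 processes, with the model coordinate varying fastest. Below is a minimal sketch of that mapping plus a consistency check on the reported counts; coord_to_rank is a hypothetical helper written for this annotation, not a Megatron-DeepSpeed API.

PP, DP, TP = 32, 1, 4  # pipeline / data / tensor parallel sizes from the log

def coord_to_rank(pipe, data, model):
    # model varies fastest, then data, then pipe, matching the table above,
    # e.g. ProcessCoord(pipe=18, data=0, model=1) -> rank 73.
    return (pipe * DP + data) * TP + model

assert coord_to_rank(18, 0, 1) == 73
assert coord_to_rank(31, 0, 3) == 127

# Each middle stage holds 2 transformer layers per TP shard (807539800 params);
# per the stage listing, the first and last stages also carry an EmbeddingPipe
# (the last adds the final MixedFusedLayerNorm), hence their larger counts.
middle, first, last = 807539800, 978291800, 978315000
total = TP * (first + last) + TP * (PP - 2) * middle
assert total == 104_731_203_200  # ~104.7B parameters, consistent with a 104B run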
-[2021-10-21 21:28:01,340] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-21 21:28:01,341] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-21 21:28:01,341] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.28 GB, percent = 21.5%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-21 21:28:01,342] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+57dee5a, git-hash=57dee5a, git-branch=pp_deadlock_fix
-[2021-10-21 21:28:01,379] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-21 21:28:01,379] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-21 21:28:01,379] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-21 21:28:01,380] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-21 21:28:01,380] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-21 21:28:01,380] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-21 21:28:01,380] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-21 21:28:01,380] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-21 21:28:01,380] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-21 21:28:01,380] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
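The engine lines above fix the optimizer setup: a client-constructed FusedAdam wrapped in DeepSpeed's fp16 ZeRO stage-1 optimizer, with 5e8-element reduce/allgather buckets and no CPU offload. A minimal config sketch consistent with those lines follows; the run's actual JSON config is not part of this excerpt, so batch-size, scheduler, and logging fields are deliberately omitted.

# Only the fields echoed in the log above are grounded; nothing else is assumed.
ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 1,
        "reduce_bucket_size": 500_000_000,
        "allgather_bucket_size": 500_000_000,
        "cpu_offload": False,
    },
}
# "Using client Optimizer as basic optimizer" corresponds to passing a
# pre-built FusedAdam instance to deepspeed.initialize(model=model,
# optimizer=optimizer, config=ds_config) instead of configuring one in JSON.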
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-[1/2] c++ -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o
-[2/2] c++ flatten_unflatten.o -shared -L/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so
-Loading extension module utils...
-Time to load utils op: 12.890349864959717 seconds
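The ~13 s "Time to load utils op" is the one-time ninja build of DeepSpeed's utils op (flatten_unflatten.cpp); once utils.so is cached under the torch_extensions root, the other processes re-load it in milliseconds, which is what the later "No modifications detected ... skipping build step" lines report. A rough sketch of the equivalent PyTorch call; DeepSpeed reaches this through its own op_builder layer, so this is illustrative, not the exact code path.

from torch.utils.cpp_extension import load

# Builds the extension with ninja on first use and caches the result under
# $TORCH_EXTENSIONS_DIR (here ~/.cache/torch_extensions); later calls skip
# the build and simply load the cached utils.so.
utils = load(
    name="utils",
    sources=["deepspeed/ops/csrc/utils/flatten_unflatten.cpp"],
    verbose=True,  # prints the "Emitting ninja build file ..." lines seen above
)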
-Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0022156238555908203 seconds
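These partition lines are ZeRO stage-1 bookkeeping: one entry per optimizer parameter group, and with a data-parallel world size of 1 the partition count is [1, 1], so each rank simply keeps its whole shard. The two sizes always sum to the per-rank parameter counts logged earlier; a quick check follows. Reading the large/small entries as Megatron-style weight-decay vs. no-weight-decay groups is an assumption, not something the log states.

# Per-group partition sizes from the log, and the per-rank totals they imply.
groups = {
    "middle stages": (807_360_000, 179_800),  # ranks on pipeline stages 1-30
    "first stage":   (978_112_000, 179_800),  # ranks 0-3 (embedding included)
    "last stage":    (978_112_000, 203_000),  # ranks 124-127 (embedding + final norm)
}
totals = {"middle stages": 807539800, "first stage": 978291800, "last stage": 978315000}
for name, (big, small) in groups.items():
    assert big + small == totals[name]  # matches the "number of parameters" lines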
-Time to load utils op: 0.00139617919921875 secondsTime to load utils op: 0.001287698745727539 seconds - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0011546611785888672 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.001371145248413086 seconds -Time to load utils op: 0.0012905597686767578 seconds -Time to load utils op: 0.0011281967163085938 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0012793540954589844 seconds -Time to load utils op: 0.0012006759643554688 secondsTime to load utils op: 0.0013451576232910156 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Time to load utils op: 0.0010671615600585938 seconds -Time to load utils op: 0.000934600830078125 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0012030601501464844 secondsTime to load utils op: 0.0010116100311279297 seconds - -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0012400150299072266 seconds -Time to load utils op: 0.0010917186737060547 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Time to load utils op: 0.001117706298828125 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0011112689971923828 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... 
-No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0009090900421142578 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0012853145599365234 seconds -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0013110637664794922 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -Time to load utils op: 0.0012106895446777344 seconds -Time to load utils op: 0.0010652542114257812 seconds -Loading extension module utils... -Time to load utils op: 0.0012142658233642578 seconds -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0014040470123291016 seconds -Time to load utils op: 0.0009138584136962891 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0010476112365722656 seconds -Time to load utils op: 0.00101470947265625 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0013113021850585938 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Loading extension module utils... -Time to load utils op: 0.0012836456298828125 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0011701583862304688 seconds -Time to load utils op: 0.0014300346374511719 seconds -Time to load utils op: 0.0012028217315673828 seconds -Time to load utils op: 0.0009922981262207031 seconds -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -Loading extension module utils... 
-No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0011448860168457031 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Time to load utils op: 0.001007080078125 secondsTime to load utils op: 0.0010521411895751953 seconds - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0011069774627685547 seconds -Time to load utils op: 0.001329183578491211 seconds -Time to load utils op: 0.0009720325469970703 seconds -Time to load utils op: 0.0013270378112792969 seconds -Time to load utils op: 0.0010533332824707031 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0012021064758300781 seconds -Time to load utils op: 0.0010991096496582031 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0010781288146972656 seconds -Time to load utils op: 0.0011827945709228516 seconds -Loading extension module utils... -Time to load utils op: 0.0012593269348144531 seconds -Time to load utils op: 0.0013196468353271484 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0011854171752929688 seconds -Time to load utils op: 0.001041412353515625 seconds -Time to load utils op: 0.00118255615234375 seconds -Time to load utils op: 0.0010650157928466797 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0012547969818115234 seconds -Time to load utils op: 0.0012295246124267578 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Loading extension module utils... 
-Time to load utils op: 0.0014696121215820312 seconds -Time to load utils op: 0.0010030269622802734 seconds -Time to load utils op: 0.0014066696166992188 seconds -Time to load utils op: 0.0009477138519287109 seconds -Time to load utils op: 0.0011942386627197266 secondsTime to load utils op: 0.0013849735260009766 seconds - -Time to load utils op: 0.0009655952453613281 seconds -Time to load utils op: 0.0014467239379882812 seconds -Time to load utils op: 0.0010600090026855469 seconds -Time to load utils op: 0.0011093616485595703 seconds -Time to load utils op: 0.0015153884887695312 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0013065338134765625 seconds -Time to load utils op: 0.0012111663818359375 seconds -Time to load utils op: 0.0015406608581542969 seconds -Time to load utils op: 0.0013866424560546875 seconds -Time to load utils op: 0.0012335777282714844 seconds -Time to load utils op: 0.0014047622680664062 seconds -Time to load utils op: 0.0013852119445800781 seconds -Time to load utils op: 0.0014393329620361328 seconds -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0018761157989501953 seconds -Time to load utils op: 0.0017096996307373047 seconds -Time to load utils op: 0.0019643306732177734 seconds -Time to load utils op: 0.0016758441925048828 seconds -Time to load utils op: 0.0018913745880126953 seconds -Time to load utils op: 0.0017914772033691406 seconds -Time to load utils op: 0.0020761489868164062 seconds -Time to load utils op: 0.0019230842590332031 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Loading extension module utils... 
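The build/skip/load messages above are emitted by PyTorch's JIT C++ extension loader (torch.utils.cpp_extension), which DeepSpeed's op builder uses to compile its fused kernels once and then re-import them from the per-user cache directory reported as the "PyTorch extensions root"; the "Time to load ... op" line is printed by the DeepSpeed builder around that call. A minimal sketch of that path, assuming a hypothetical csrc/utils.cpp source file (this is not DeepSpeed's actual builder code):

    # Sketch of the JIT load path behind the messages above. "csrc/utils.cpp"
    # is a hypothetical source file; verbose=True produces the "skipping build
    # step" / "Loading extension module" messages when the cached build is hit.
    import time
    from torch.utils.cpp_extension import load

    start = time.time()
    utils_op = load(
        name="utils",                 # extension name that appears in the log
        sources=["csrc/utils.cpp"],   # hypothetical C++ source for the op
        verbose=True,
    )
    print(f"Time to load utils op: {time.time() - start} seconds")

Each of the 128 ranks runs this load independently, which is why the sequence repeats and interleaves in the log.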
-[2021-10-21 21:28:16,324] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-21 21:28:16,324] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-21 21:28:16,324] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.3 GB, percent = 21.5%
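For reference, MA / Max_MA are current and peak GPU memory allocated by tensors, while CA / Max_CA are current and peak memory reserved (cached) by the CUDA allocator; the CPU line reads host virtual-memory statistics. A rough equivalent of such a probe (a sketch, not DeepSpeed's actual utils.see_memory_usage; psutil is assumed to be installed):

    # Sketch mirroring what the MA/Max_MA/CA/Max_CA log lines measure.
    import psutil   # assumed dependency, used only for the CPU line
    import torch

    def see_memory_usage(message: str) -> None:
        gb = 1024 ** 3
        print(message)
        print(
            f"MA {torch.cuda.memory_allocated() / gb:.2f} GB "
            f"Max_MA {torch.cuda.max_memory_allocated() / gb:.2f} GB "
            f"CA {torch.cuda.memory_reserved() / gb:.2f} GB "
            f"Max_CA {torch.cuda.max_memory_reserved() / gb:.2f} GB"
        )
        vm = psutil.virtual_memory()
        print(f"CPU Virtual Memory: used = {vm.used / gb:.1f} GB, percent = {vm.percent}%")

The jump from MA 5.47 GB here to 12.76 GB below corresponds to the fp32 optimizer states being allocated.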
-[2021-10-21 21:28:16,378] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-21 21:28:16,379] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-21 21:28:16,379] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.3 GB, percent = 21.5%
-[2021-10-21 21:28:16,379] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-21 21:28:16,414] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-21 21:28:16,414] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-21 21:28:16,415] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.3 GB, percent = 21.5%
-[2021-10-21 21:28:16,415] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-21 21:28:16,415] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-21 21:28:16,415] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-21 21:28:16,415] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-21 21:28:16,415] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-21 21:28:16,415] [INFO] [config.py:944:print]   activation_checkpointing_config  {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-21 21:28:16,415] [INFO] [config.py:944:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-21 21:28:16,415] [INFO] [config.py:944:print]   allreduce_always_fp32 ........ False
-[2021-10-21 21:28:16,415] [INFO] [config.py:944:print]   amp_enabled .................. False
-[2021-10-21 21:28:16,415] [INFO] [config.py:944:print]   amp_params ................... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   checkpoint_tag_validation_enabled  True
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   checkpoint_tag_validation_fail  False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   curriculum_enabled ........... True
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   dataloader_drop_last ......... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   disable_allgather ............ False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   dump_state ................... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_enabled ........... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_gas_boundary_resolution  1
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_layer_num ......... 0
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_max_iter .......... 100
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_stability ......... 1e-06
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_tol ............... 0.01
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   eigenvalue_verbose ........... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   elasticity_enabled ........... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   fp16_enabled ................. True
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   fp16_master_weights_and_gradients  False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   fp16_mixed_quantize .......... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   global_rank .................. 0
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   gradient_accumulation_steps .. 2048
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   gradient_clipping ............ 1.0
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   gradient_predivide_factor .... 1.0
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   initial_dynamic_scale ........ 4096
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   loss_scale ................... 0
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   memory_breakdown ............. False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   optimizer_legacy_fusion ...... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   optimizer_name ............... None
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   optimizer_params ............. None
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   pld_enabled .................. False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   pld_params ................... False
-[2021-10-21 21:28:16,416] [INFO] [config.py:944:print]   prescale_gradients ........... False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_change_rate ......... 0.001
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_groups .............. 1
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_offset .............. 1000
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_period .............. 1000
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_rounding ............ 0
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_start_bits .......... 16
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_target_bits ......... 8
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_training_enabled .... False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_type ................ 0
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   quantize_verbose ............. False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   scheduler_name ............... None
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   scheduler_params ............. None
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   sparse_attention ............. None
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   sparse_gradients_enabled ..... False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   steps_per_print .............. 2000
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   tensorboard_enabled .......... False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   tensorboard_output_path ......
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   train_batch_size ............. 2048
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   train_micro_batch_size_per_gpu  1
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   use_quantizer_kernel ......... False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   wall_clock_breakdown ......... False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   world_size ................... 1
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   zero_allow_untested_optimizer  False
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   zero_enabled ................. True
-[2021-10-21 21:28:16,417] [INFO] [config.py:944:print]   zero_optimization_stage ...... 1
-[2021-10-21 21:28:16,418] [INFO] [config.py:946:print]   json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
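The curriculum_learning block is the distinctive part of this config: with the fixed_linear schedule, the sequence-length difficulty ramps linearly from 64 to 2048 tokens over the first 36000 steps, snapped to a multiple of difficulty_step. A sketch of that schedule (mirroring the documented fixed_linear behavior, not DeepSpeed's exact implementation):

    # Seqlen curriculum implied by the config above: linear ramp from
    # min_difficulty to max_difficulty over total_curriculum_step steps,
    # floored to a multiple of difficulty_step.
    def seqlen_at_step(step: int,
                       min_difficulty: int = 64,
                       max_difficulty: int = 2048,
                       total_curriculum_step: int = 36000,
                       difficulty_step: int = 8) -> int:
        frac = min(step / total_curriculum_step, 1.0)
        seqlen = min_difficulty + (max_difficulty - min_difficulty) * frac
        seqlen = int(seqlen // difficulty_step) * difficulty_step
        return max(min_difficulty, min(seqlen, max_difficulty))

    for s in (0, 9000, 18000, 36000):
        print(s, seqlen_at_step(s))   # 0 -> 64, 9000 -> 560, 18000 -> 1056, 36000 -> 2048

Note also that train_batch_size 2048 = micro_batch_size 1 x gradient_accumulation_steps 2048 x data-parallel size 1, matching the engine CONFIG line that follows.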
-[2021-10-21 21:28:16,418] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-21 21:28:16,810] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[... RANK=1-3 are identical STAGE=0 records; RANK=4 through RANK=123 cover STAGE=1 through STAGE=30, four ranks per stage, each stage spanning LAYERS=2 with STAGE_PARAMS=807539800 (807.540M); duplicate records elided ...]
-[2021-10-21 21:28:16,810] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[... RANK=125-127 identical STAGE=31 records elided ...]
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - will not load any checkpoints and will start from random
-time (ms) | load-checkpoint: 8.61
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
- warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 103.3650944
-estimated model parameters without embeddings: 103.3650944
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several 
copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of 
the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 
-estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.2213504 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.2213504 -estimated model parameters: 125.2213504 -estimated model parameters: 125.2213504 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the 
embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies 
of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold 
several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - 
warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 
1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model 
parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-conglongli/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 125.22432 -estimated model 
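The UserWarning kept above (megatron/utils.py:274) is why two different per-rank counts appear: with pipeline parallelism and tied input/output embeddings, both the first and the last pipeline stage hold a copy of the embedding matrix, so summing per-rank totals over-counts those weights. A rough Python sketch of the effect; all names are hypothetical and this is not code from the repository:

    def stage_param_count(stage, num_stages, layer_params, embed_params):
        # Parameters held by one pipeline stage with tied embeddings:
        # the first and the last stage each carry an embedding copy.
        count = layer_params
        if stage in (0, num_stages - 1):
            count += embed_params
        return count

    def naive_total(num_stages, layer_params, embed_params):
        # Summing the per-stage counts double-counts the tied embedding,
        # which is exactly what the warning cautions against for PP > 1.
        return sum(stage_param_count(s, num_stages, layer_params, embed_params)
                   for s in range(num_stages))

    def corrected_total(num_stages, layer_params, embed_params):
        return naive_total(num_stages, layer_params, embed_params) - embed_params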
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-21 21:28:16
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 3000320
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-     reading sizes...
-     reading pointers...
-     reading document index...
-     creating numpy buffer of mmap...
-     creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.363446 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.230 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.164 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.043 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-21 21:28:23
-done with setup ...
-training ...
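The doc-idx / sample-idx / shuffle-idx mappings loaded above are plain .npy arrays ("creating numpy buffer of mmap" suggests they are memory-mapped rather than read fully into RAM). A minimal sketch of inspecting them with NumPy; the prefix below is one of the real filenames from the log, and the array meanings are only what their names suggest:

    import numpy as np

    prefix = ("/gpfswork/rech/six/commun/datasets-custom/oscar-en/"
              "meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s")

    # mmap_mode="r" maps each file lazily; with ~657M training samples the
    # mappings would be wasteful to eagerly load on every rank.
    doc_idx     = np.load(prefix + "_doc_idx.npy",     mmap_mode="r")
    sample_idx  = np.load(prefix + "_sample_idx.npy",  mmap_mode="r")
    shuffle_idx = np.load(prefix + "_shuffle_idx.npy", mmap_mode="r")

    print(doc_idx.shape, sample_idx.shape, shuffle_idx.shape)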
-Number of parameters: 125.2213504 billion
-time (ms) | model-and-optimizer-setup: 18012.40 | train/valid/test-data-iterators-setup: 5643.16
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.368064 billion
of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion - -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 125.22432 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.368064 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.368064 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 125.2213504 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -Number of parameters: 103.3650944 billion -Number of parameters without embeddings: 103.3650944 billion -[before the start of training step] datetime: 2021-10-21 21:28:23 -[2021-10-21 21:28:23,393] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information -[2021-10-21 21:28:23,393] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False -[2021-10-21 21:28:23,393] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers -[2021-10-21 21:28:23,393] [INFO] [checkpointing.py:554:forward] ----Synchronization False -[2021-10-21 21:28:23,393] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False -[Rank 1] (after 1 iterations) memory (MB) | allocated: 
-[Rank 1] (after 1 iterations) memory (MB) | allocated: 13202.67822265625 | max allocated: 20666.22705078125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 125] (after 1 iterations) memory (MB) | allocated: 13082.60107421875 | max allocated: 20546.20703125 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 5] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 0] (after 1 iterations) memory (MB) | allocated: 13203.03955078125 | max allocated: 20666.58837890625 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 124] (after 1 iterations) memory (MB) | allocated: 13082.369140625 | max allocated: 20545.97509765625 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 126] (after 1 iterations) memory (MB) | allocated: 13082.369140625 | max allocated: 20545.97509765625 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 2] (after 1 iterations) memory (MB) | allocated: 13202.06298828125 | max allocated: 20665.61181640625 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 3] (after 1 iterations) memory (MB) | allocated: 13203.30322265625 | max allocated: 20666.85205078125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 112] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16948.21923828125 | reserved: 16994.0 | max reserved: 16994.0
- iteration 1/ 292968 | consumed samples: 2048 | consumed tokens: 131072 | elapsed time per iteration (ms): 204975.6 | learning rate: 5.680E-07 | global batch size: 2048 | lm loss: 1.316407E+01 | loss scale: 4096.0 | grad norm: 224806.780 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-[Rank 127] (after 1 iterations) memory (MB) | allocated: 13082.68505859375 | max allocated: 20546.291015625 | reserved: 24406.0 | max reserved: 24406.0
-time (ms)
- iteration 2/ 292968 | consumed samples: 4096 | consumed tokens: 262144 | elapsed time per iteration (ms): 126852.5 | learning rate: 1.136E-06 | global batch size: 2048 | lm loss: 1.315916E+01 | loss scale: 4096.0 | grad norm: 225244.360 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3/ 292968 | consumed samples: 6144 | consumed tokens: 393216 | elapsed time per iteration (ms): 116457.3 | learning rate: 1.704E-06 | global batch size: 2048 | lm loss: 2.324803E+01 | loss scale: 4096.0 | grad norm: 1381761.459 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 4/ 292968 | consumed samples: 8192 | consumed tokens: 524288 | elapsed time per iteration (ms): 112171.3 | learning rate: 2.272E-06 | global batch size: 2048 | lm loss: 3.475053E+01 | loss scale: 4096.0 | grad norm: 1845285.271 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 5/ 292968 | consumed samples: 10240 | consumed tokens: 655360 | elapsed time per iteration (ms): 102880.2 | learning rate: 2.840E-06 | global batch size: 2048 | lm loss: 3.745642E+01 | loss scale: 4096.0 | grad norm: 1436900.964 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 6/ 292968 | consumed samples: 12288 | consumed tokens: 786432 | elapsed time per iteration (ms): 102783.6 | learning rate: 3.408E-06 | global batch size: 2048 | lm loss: 3.983621E+01 | loss scale: 4096.0 | grad norm: 1067945.196 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 7/ 292968 | consumed samples: 14336 | consumed tokens: 917504 | elapsed time per iteration (ms): 95986.7 | learning rate: 3.976E-06 | global batch size: 2048 | lm loss: 3.536437E+01 | loss scale: 4096.0 | grad norm: 1080819.724 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 8/ 292968 | consumed samples: 16384 | consumed tokens: 1048576 | elapsed time per iteration (ms): 92557.1 | learning rate: 4.544E-06 | global batch size: 2048 | lm loss: 3.412041E+01 | loss scale: 4096.0 | grad norm: 1023567.591 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 9/ 292968 | consumed samples: 18432 | consumed tokens: 1179648 | elapsed time per iteration (ms): 91935.4 | learning rate: 5.112E-06 | global batch size: 2048 | lm loss: 3.219579E+01 | loss scale: 4096.0 | grad norm: 654723.072 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 10/ 292968 | consumed samples: 20480 | consumed tokens: 1310720 | elapsed time per iteration (ms): 90080.9 | learning rate: 5.680E-06 | global batch size: 2048 | lm loss: 2.971920E+01 | loss scale: 4096.0 | grad norm: 537991.005 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 11/ 292968 | consumed samples: 22528 | consumed tokens: 1441792 | elapsed time per iteration (ms): 88691.3 | learning rate: 6.249E-06 | global batch size: 2048 | lm loss: 2.729292E+01 | loss scale: 4096.0 | grad norm: 424745.696 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 12/ 292968 | consumed samples: 24576 | consumed tokens: 1572864 | elapsed time per iteration (ms): 88398.6 | learning rate: 6.817E-06 | global batch size: 2048 | lm loss: 2.790564E+01 | loss scale: 4096.0 | grad norm: 644211.527 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 13/ 292968 | consumed samples: 26624 | consumed tokens: 1703936 | elapsed time per iteration (ms): 88502.3 | learning rate: 7.385E-06 | global batch size: 2048 | lm loss: 2.526423E+01 | loss scale: 4096.0 | grad norm: 454067.335 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 14/ 292968 | consumed samples: 28672 | consumed tokens: 1835008 | elapsed time per iteration (ms): 87733.4 | learning rate: 7.953E-06 | global batch size: 2048 | lm loss: 2.331569E+01 | loss scale: 4096.0 | grad norm: 276743.182 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 15/ 292968 | consumed samples: 30720 | consumed tokens: 1966080 | elapsed time per iteration (ms): 86247.0 | learning rate: 8.521E-06 | global batch size: 2048 | lm loss: 2.094402E+01 | loss scale: 4096.0 | grad norm: 226314.869 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 16/ 292968 | consumed samples: 32768 | consumed tokens: 2097152 | elapsed time per iteration (ms): 86013.9 | learning rate: 9.089E-06 | global batch size: 2048 | lm loss: 1.969643E+01 | loss scale: 4096.0 | grad norm: 135309.147 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 17/ 292968 | consumed samples: 34816 | consumed tokens: 2228224 | elapsed time per iteration (ms): 86000.3 | learning rate: 9.657E-06 | global batch size: 2048 | lm loss: 1.816238E+01 | loss scale: 4096.0 | grad norm: 74699.814 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 18/ 292968 | consumed samples: 36864 | consumed tokens: 2359296 | elapsed time per iteration (ms): 85741.8 | learning rate: 1.022E-05 | global batch size: 2048 | lm loss: 1.715309E+01 | loss scale: 4096.0 | grad norm: 43055.680 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 19/ 292968 | consumed samples: 38912 | consumed tokens: 2490368 | elapsed time per iteration (ms): 86363.7 | learning rate: 1.079E-05 | global batch size: 2048 | lm loss: 1.587515E+01 | loss scale: 4096.0 | grad norm: 40328.680 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 20/ 292968 | consumed samples: 40960 | consumed tokens: 2621440 | elapsed time per iteration (ms): 87039.7 | learning rate: 1.136E-05 | global batch size: 2048 | lm loss: 1.445321E+01 | loss scale: 4096.0 | grad norm: 178516.421 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 21/ 292968 | consumed samples: 43008 | consumed tokens: 2752512 | elapsed time per iteration (ms): 86563.9 | learning rate: 1.193E-05 | global batch size: 2048 | lm loss: 1.723314E+01 | loss scale: 4096.0 | grad norm: 467676.180 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 22/ 292968 | consumed samples: 45056 | consumed tokens: 2883584 | elapsed time per iteration (ms): 86929.8 | learning rate: 1.250E-05 | global batch size: 2048 | lm loss: 1.384353E+01 | loss scale: 4096.0 | grad norm: 349625.568 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 23/ 292968 | consumed samples: 47104 | consumed tokens: 3014656 | elapsed time per iteration (ms): 86274.0 | learning rate: 1.307E-05 | global batch size: 2048 | lm loss: 1.433385E+01 | loss scale: 4096.0 | grad norm: 295627.439 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 24/ 292968 | consumed samples: 49152 | consumed tokens: 3145728 | elapsed time per iteration (ms): 87804.9 | learning rate: 1.363E-05 | global batch size: 2048 | lm loss: 1.566444E+01 | loss scale: 4096.0 | grad norm: 426731.939 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 25/ 292968 | consumed samples: 51200 | consumed tokens: 3276800 | elapsed time per iteration (ms): 86109.0 | learning rate: 1.420E-05 | global batch size: 2048 | lm loss: 1.351891E+01 | loss scale: 4096.0 | grad norm: 214665.644 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 26/ 292968 | consumed samples: 53248 | consumed tokens: 3407872 | elapsed time per iteration (ms): 86387.3 | learning rate: 1.477E-05 | global batch size: 2048 | lm loss: 1.299350E+01 | loss scale: 4096.0 | grad norm: 196219.543 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 27/ 292968 | consumed samples: 55296 | consumed tokens: 3538944 | elapsed time per iteration (ms): 85245.0 | learning rate: 1.534E-05 | global batch size: 2048 | lm loss: 1.253081E+01 | loss scale: 4096.0 | grad norm: 40435.746 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 28/ 292968 | consumed samples: 57344 | consumed tokens: 3670016 | elapsed time per iteration (ms): 86509.8 | learning rate: 1.591E-05 | global batch size: 2048 | lm loss: 1.233641E+01 | loss scale: 4096.0 | grad norm: 59434.881 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 29/ 292968 | consumed samples: 59392 | consumed tokens: 3801088 | elapsed time per iteration (ms): 86102.6 | learning rate: 1.647E-05 | global batch size: 2048 | lm loss: 1.230502E+01 | loss scale: 4096.0 | grad norm: 83241.888 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 30/ 292968 | consumed samples: 61440 | consumed tokens: 3932160 | elapsed time per iteration (ms): 85456.0 | learning rate: 1.704E-05 | global batch size: 2048 | lm loss: 1.178389E+01 | loss scale: 4096.0 | grad norm: 34948.162 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 31/ 292968 | consumed samples: 63488 | consumed tokens: 4063232 | elapsed time per iteration (ms): 86188.5 | learning rate: 1.761E-05 | global batch size: 2048 | lm loss: 1.131446E+01 | loss scale: 4096.0 | grad norm: 33246.558 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 32/ 292968 | consumed samples: 65536 | consumed tokens: 4194304 | elapsed time per iteration (ms): 85866.1 | learning rate: 1.818E-05 | global batch size: 2048 | lm loss: 1.087723E+01 | loss scale: 4096.0 | grad norm: 62673.048 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 33/ 292968 | consumed samples: 67584 | consumed tokens: 4325376 | elapsed time per iteration (ms): 85043.8 | learning rate: 1.875E-05 | global batch size: 2048 | lm loss: 1.036173E+01 | loss scale: 4096.0 | grad norm: 53524.152 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 34/ 292968 | consumed samples: 69632 | consumed tokens: 4456448 | elapsed time per iteration (ms): 84939.6 | learning rate: 1.931E-05 | global batch size: 2048 | lm loss: 9.918847E+00 | loss scale: 4096.0 | grad norm: 59973.909 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-saving checkpoint at iteration 34 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
[INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/mp_rank_00_model_states.pt -[2021-10-21 22:21:59,294] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/mp_rank_01_model_states.pt -[2021-10-21 22:22:12,192] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-21 22:22:12,192] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-21 22:22:12,200] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-21 22:22:12,246] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-21 22:22:12,278] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-21 22:22:12,289] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-21 22:22:12,319] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_28_optim_states.pt -[2021-10-21 22:22:12,327] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-21 22:22:12,331] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-21 22:22:12,359] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-21 22:22:12,399] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-21 22:22:12,407] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-21 22:22:12,408] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-21 22:22:12,434] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-21 22:22:12,457] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-21 22:22:12,466] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_11_optim_states.pt -[2021-10-21 22:22:12,473] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_05_optim_states.pt -[2021-10-21 22:22:12,475] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-21 22:22:12,527] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_14_optim_states.pt -[2021-10-21 22:22:12,545] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-21 22:22:12,551] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-21 22:22:12,631] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_10_optim_states.pt -[2021-10-21 22:22:12,719] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-21 22:22:12,815] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-21 22:22:12,836] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-21 22:22:12,847] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-21 22:22:12,884] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-21 22:22:12,955] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_06_optim_states.pt -[2021-10-21 22:22:13,159] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-21 22:22:13,261] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-21 22:22:13,280] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_49_optim_states.pt 
-[2021-10-21 22:22:13,292] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-21 22:22:13,305] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_118_optim_states.pt -[2021-10-21 22:22:13,315] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_82_optim_states.pt -[2021-10-21 22:22:13,317] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-21 22:22:13,359] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-21 22:22:13,386] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-21 22:22:13,408] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-21 22:22:13,423] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-21 22:22:13,435] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-21 22:22:13,440] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-21 22:22:13,459] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-21 22:22:13,460] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-21 22:22:13,463] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-21 22:22:13,467] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-21 22:22:13,470] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-21 22:22:13,484] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-21 22:22:13,490] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-21 22:22:13,502] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_08_optim_states.pt -[2021-10-21 22:22:13,512] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-21 22:22:13,517] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-21 22:22:13,523] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_84_optim_states.pt -[2021-10-21 22:22:13,543] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-21 22:22:13,553] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-21 22:22:13,559] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-21 22:22:13,560] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-21 22:22:13,573] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-21 22:22:13,580] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-21 22:22:13,580] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-21 22:22:13,597] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-21 22:22:13,623] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-21 22:22:13,668] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-21 22:22:13,682] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-21 22:22:13,709] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_07_optim_states.pt 
-[2021-10-21 22:22:13,712] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_101_optim_states.pt
-[2021-10-21 22:22:13,755] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_26_optim_states.pt
-[2021-10-21 22:22:13,763] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_99_optim_states.pt
-[2021-10-21 22:22:13,769] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_98_optim_states.pt
-[2021-10-21 22:22:13,793] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_112_optim_states.pt
-[2021-10-21 22:22:13,803] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_40_optim_states.pt
-[2021-10-21 22:22:13,831] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_106_optim_states.pt
-[2021-10-21 22:22:13,840] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_102_optim_states.pt
-[2021-10-21 22:22:13,870] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_04_optim_states.pt
-[2021-10-21 22:22:13,899] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_115_optim_states.pt
-[2021-10-21 22:22:13,901] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_109_optim_states.pt
-[2021-10-21 22:22:13,930] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_52_optim_states.pt
-[2021-10-21 22:22:13,935] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_85_optim_states.pt
-[2021-10-21 22:22:13,960] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_58_optim_states.pt
-[2021-10-21 22:22:13,998] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_108_optim_states.pt
-[2021-10-21 22:22:14,061] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_62_optim_states.pt
-[2021-10-21 22:22:14,130] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_71_optim_states.pt
-[2021-10-21 22:22:14,152] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_83_optim_states.pt
-[2021-10-21 22:22:14,160] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_68_optim_states.pt
-[2021-10-21 22:22:14,205] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_81_optim_states.pt
-[2021-10-21 22:22:14,252] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_60_optim_states.pt
-[2021-10-21 22:22:14,288] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_87_optim_states.pt
-[2021-10-21 22:22:14,298] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_79_optim_states.pt
-[2021-10-21 22:22:14,346] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_78_optim_states.pt
-[2021-10-21 22:22:14,383] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_56_optim_states.pt
-[2021-10-21 22:22:14,395] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_114_optim_states.pt
-[2021-10-21 22:22:14,561] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_113_optim_states.pt
-[2021-10-21 22:22:14,567] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_97_optim_states.pt
-[2021-10-21 22:22:14,591] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_96_optim_states.pt
-[2021-10-21 22:22:14,890] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_125_optim_states.pt
-[2021-10-21 22:22:14,983] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_120_optim_states.pt
-[2021-10-21 22:22:15,121] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_126_optim_states.pt
-[2021-10-21 22:22:16,139] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_127_optim_states.pt
-[2021-10-21 22:22:16,337] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_103_optim_states.pt
-[2021-10-21 22:22:16,391] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_01_optim_states.pt
-[2021-10-21 22:22:16,448] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_03_optim_states.pt
-[2021-10-21 22:22:16,565] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_75_optim_states.pt
-[2021-10-21 22:22:16,772] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_124_optim_states.pt
-[2021-10-21 22:22:17,116] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_100_optim_states.pt
-[2021-10-21 22:22:17,747] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_72_optim_states.pt
-[2021-10-21 22:22:18,065] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_91_optim_states.pt
-[2021-10-21 22:22:19,476] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_89_optim_states.pt
-[2021-10-21 22:22:20,354] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_27_optim_states.pt
-[2021-10-21 22:22:21,091] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_24_optim_states.pt
-[2021-10-21 22:22:21,140] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_20_optim_states.pt
-[2021-10-21 22:22:21,724] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_23_optim_states.pt
-[2021-10-21 22:22:21,977] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_02_optim_states.pt
-[2021-10-21 22:22:22,434] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_54_optim_states.pt
-[2021-10-21 22:22:22,686] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_53_optim_states.pt
-[2021-10-21 22:22:23,042] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_00_optim_states.pt
-[2021-10-21 22:22:23,044] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_30_optim_states.pt
-[2021-10-21 22:22:23,507] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_34_optim_states.pt
-[2021-10-21 22:22:23,687] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_95_optim_states.pt
-[2021-10-21 22:22:23,710] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_21_optim_states.pt
-[2021-10-21 22:22:24,198] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_67_optim_states.pt
-[2021-10-21 22:22:24,498] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-21 22:22:24,738] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_32_optim_states.pt
-[2021-10-21 22:22:24,759] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_29_optim_states.pt
-[2021-10-21 22:22:25,058] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_105_optim_states.pt
-[2021-10-21 22:22:25,150] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_93_optim_states.pt
-[2021-10-21 22:22:25,474] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_65_optim_states.pt
-[2021-10-21 22:22:25,855] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_110_optim_states.pt
-[2021-10-21 22:22:26,106] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_107_optim_states.pt
-[2021-10-21 22:22:26,865] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step34/zero_pp_rank_0_mp_rank_111_optim_states.pt
- successfully saved checkpoint at iteration 34 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 30665.91
-[exiting program after 55.0033370534579 minutes] datetime: 2021-10-21 22:22:26
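[Editor's note: the records above show DeepSpeed's ZeRO checkpointing writing one optimizer-state shard per pipeline-parallel/tensor-parallel rank. The following is a minimal, hypothetical Python sketch of that sharding pattern, not DeepSpeed's actual _save_zero_checkpoint implementation; the function name and payload layout are illustrative assumptions.]

    import os
    import torch

    def save_zero_shard(optimizer, ckpt_root, global_step, pp_rank, mp_rank):
        # One file per (pp_rank, mp_rank), mirroring the
        # zero_pp_rank_*_mp_rank_*_optim_states.pt names in the log above.
        step_dir = os.path.join(ckpt_root, f"global_step{global_step}")
        os.makedirs(step_dir, exist_ok=True)
        path = os.path.join(
            step_dir,
            f"zero_pp_rank_{pp_rank}_mp_rank_{mp_rank:02d}_optim_states.pt",
        )
        # Each rank serializes only its own optimizer shard (payload
        # layout here is an assumption for illustration).
        torch.save({"optimizer_state_dict": optimizer.state_dict()}, path)
        return path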
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
[... the same OMP_NUM_THREADS banner, printed once per launched process at relaunch, elided ...]
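[Editor's note: the banner above is the launcher defaulting OMP_NUM_THREADS to 1 and asking the user to tune it. A minimal sketch of such tuning, assuming a hypothetical cores-per-process budget; the value 4 is illustrative, not from the log.]

    import os

    cores_per_proc = 4  # hypothetical: physical cores / local ranks
    # Must be set before torch (and numpy) are imported in each worker.
    os.environ.setdefault("OMP_NUM_THREADS", str(cores_per_proc))

    import torch  # imported after the env var is set on purpose
    torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))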
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
[... the same op report and compatibility table, printed once per process and garbled where ranks wrote to the log concurrently, elided ...]
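[Editor's note: the op report above is emitted by each process during DeepSpeed's extension-op check. Assuming an installation of this era that ships the ds_report entry point and the op_builder API (both assumptions, not confirmed by the log), the same information can be reproduced offline.]

    import subprocess

    # Prints the same op report/table as above (assumes the ds_report
    # console script installed with DeepSpeed is on PATH).
    subprocess.run(["ds_report"], check=True)

    # Per-op compatibility check via the builder API (assumed available
    # in this DeepSpeed version).
    from deepspeed.ops.op_builder import CPUAdamBuilder
    print("cpu_adam compatible:", CPUAdamBuilder().is_compatible())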
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
[... the same async_io warning from the remaining processes elided ...]
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
[... the same libaio hint and feature table repeated by the remaining processes elided ...]
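[Editor's note: the warning above suggests installing libaio-devel or pointing CFLAGS/LDFLAGS at an existing libaio before the async_io op is built. A sketch of the environment-variable route; the install prefix is a placeholder, not from the log.]

    import os

    libaio_prefix = "/path/to/libaio"  # hypothetical install prefix
    # Point the compiler and linker at the user-local libaio before
    # (re)building DeepSpeed so the async_io op can JIT-compile.
    os.environ["CFLAGS"] = f"-I{libaio_prefix}/include"
    os.environ["LDFLAGS"] = f"-L{libaio_prefix}/lib"
    # then reinstall/rebuild DeepSpeed in this environment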
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -............... [NO] ....... [NO] -async_io ............... transformer_inference[NO] ......... [NO][NO] -....... [OKAY] -utils .................. [NO] ....... transformer_inference[OKAY] -.. [NO] quantizer....... ..............[OKAY] -[NO] ....... utils[OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -.................. [NO] .......-------------------------------------------------- -[OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  async_io: please install the libaio-devel package with yum -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utilstransformer_inference .................... [NO][NO] ....... .......[OKAY] -[OKAY] -quantizer .............. [NO]utils ......................... [OKAY][NO] - ....... 
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
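The same environment report is emitted by every rank. A minimal sketch that reproduces its gist from a Python prompt, assuming only that torch and deepspeed import cleanly (DeepSpeed's packaged `ds_report` tool prints the full, authoritative report):

```python
# Sketch: reproduce the gist of the "DeepSpeed general environment info"
# block using only attributes that torch and deepspeed expose directly.
import shutil
import subprocess

import deepspeed
import torch

print("torch install path ....", list(torch.__path__))
print("torch version .........", torch.__version__)
print("torch cuda version ....", torch.version.cuda)
print("deepspeed install path", list(deepspeed.__path__))
print("deepspeed info ........", deepspeed.__version__)
# nvcc reports the system toolkit, which may legitimately differ from the
# CUDA runtime the torch wheel was built against (11.2 vs 11.1 in this run).
if shutil.which("nvcc"):
    print(subprocess.run(["nvcc", "--version"],
                         capture_output=True, text=True).stdout)
```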
-**** Git info for Megatron: git_hash=829cefd git_branch=main ****
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
- - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninjaJIT compiled ops requires ninja - - - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. [OKAY] [OKAY][OKAY] -[OKAY] - - --------------------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -op nameop name op nameop name ................ ................ ................................installed installedinstalledinstalled.. .. ..compatible .. -compatible compatible -compatible-------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - -cpu_adam cpu_adamcpu_adam............... ...............cpu_adam...............[NO] [NO]...............[NO]....... .......[NO].......[OKAY] - .......[OKAY][OKAY] - -[OKAY] -fused_adam .............fused_adamfused_adam [NO]fused_adam ............. .................... ............. [NO] [NO][OKAY] [NO] -....... ....... ....... [OKAY] fused_lamb[OKAY][OKAY] - - -............. [NO] fused_lamb.......fused_lamb fused_lamb ............. [OKAY]............. ............. - [NO][NO][NO] ..................... [OKAY][OKAY][OKAY] - - -sparse_attn ............ [NO] .......sparse_attn [OKAY]sparse_attnsparse_attn............ - ........................[NO] transformer[NO] [NO] ................... ....... .......[NO] [OKAY] [OKAY] [OKAY] - -....... - [OKAY]transformer - transformer............transformer stochastic_transformer........................[NO] [NO][NO] ........ ....... .......[OKAY][NO] -[OKAY] [OKAY] -....... - [OKAY] -stochastic_transformerstochastic_transformerstochastic_transformer ... [NO][NO][NO] ..................... [OKAY][OKAY][OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report --------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - ---------------------------------------------------JIT compiled ops requires ninja--------------------------------------------------JIT compiled ops requires ninja - - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja -JIT compiled ops requires ninja -DeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------DeepSpeed C++/CUDA extension op report - - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja --------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report--------------------------------------------------DeepSpeed C++/CUDA extension op report - - -------------------------------------------------------------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - - ----------------------------------------------------------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -JIT compiled ops requires ninja - -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op name op nameop nameop name................ ................................installed................ installed installedinstalled .... ..compatiblecompatible.. - ---------------------------------------------------compatiblecompatible-------------------------------------------------- - - - ----------------------------------------------------------------------------------------------------- - -cpu_adam cpu_adamcpu_adam...............cpu_adam ............... ...............[NO][NO]............... [NO].......[NO] ....... ....... [OKAY]....... [OKAY] [OKAY] - -[OKAY] - -fused_adamfused_adamfused_adam fused_adam .......................... ............. ............. [NO][NO] [NO] [NO] ....... .............. ....... [OKAY][OKAY] [OKAY] - - -[OKAY] -fused_lambfused_lamb fused_lamb .............fused_lamb ............. ............. [NO] [NO]............. [NO] ....... ....... [NO] [OKAY]....... - [OKAY] ....... -[OKAY] -[OKAY] -sparse_attn ............ [NO] sparse_attnsparse_attn.......sparse_attn ............ ........................ [OKAY] [NO] [NO] -[NO] .......transformer....... ....... ............[OKAY][OKAY][OKAY] - -[NO] - .......transformer transformertransformer[OKAY]............ -[NO]........................ .......[NO][NO]stochastic_transformer [OKAY]....... -........ [OKAY][NO][OKAY]stochastic_transformer - -....... stochastic_transformer [OKAY]stochastic_transformer. - . [NO] [NO]........ .......[OKAY][NO] - [OKAY]....... - [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninja -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
- ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -op nameop nameop name ................op name................................ installed................installedinstalled .. installed .... compatible ..compatible -compatiblecompatible - --------------------------------------------------- --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ------------------------------------------------------------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -cpu_adam cpu_adam............... ...............cpu_adamcpu_adam [NO] [NO] .............................. ....... .......[NO][NO] [OKAY].......[OKAY] - ....... -[OKAY] -[OKAY] -fused_adamfused_adam .......................... [NO]fused_adamfused_adam[NO] ........................................ [OKAY][OKAY][NO] -[NO] - .......fused_lamb....... fused_lamb [OKAY] .............[OKAY] -............. - [NO][NO]fused_lamb fused_lamb........................... [OKAY] ............. -[NO][OKAY] -[NO]....... [OKAY]....... - [OKAY] -sparse_attn sparse_attn............ ............[NO] [NO]....... 
.......[OKAY]sparse_attnsparse_attn - [OKAY]........................transformer - transformer[NO]............[NO] ..........................[NO] [NO][OKAY].......[OKAY] - -.......[OKAY] -transformer[OKAY] -transformer............ stochastic_transformer stochastic_transformer............ [NO] .[NO]........ [NO]....... [OKAY].......[NO][OKAY] - -[OKAY]....... - stochastic_transformer[OKAY] stochastic_transformer - . .[NO] [NO]....... ....... [OKAY][OKAY] - -ninjaninjaninjaninja .................................... .................. ..................[OKAY] [OKAY][OKAY] -[OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop name op nameop name ................ ................................ ................ installed installedinstalledinstalled ........ compatiblecompatiblecompatiblecompatible - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -cpu_adamcpu_adamcpu_adam ...............cpu_adam ..............................[NO]............... [NO][NO][NO]....... ....... .............. [OKAY] - [OKAY][OKAY][OKAY] - - -fused_adam .............fused_adam fused_adamfused_adam [NO] ............. .......................... ....... [NO][NO] [NO] [OKAY] ....... ....... -ninjaninjaninjaninja .................. .................................... .................. [OKAY][OKAY] - - ....... [OKAY][OKAY][OKAY] -fused_lamb - -[OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -----------------------------------------------------------------------------------------------------op nameop name - - .............fused_lamb [NO]fused_lamb............. fused_lamb....... ............. [NO] ............. [OKAY][NO] ....... - [NO][OKAY]....... - op name................op name................ installed................installed ................ .. .. installedinstalled compatible compatible.. - - .......[OKAY] -[OKAY] - ..-------------------------------------------------- --------------------------------------------------compatible -compatible - - ----------------------------------------------------------------------------------------------------- - -sparse_attn ............ [NO] sparse_attn....... ............[OKAY] sparse_attn -cpu_adam ...............cpu_adam [NO]............... cpu_adamcpu_adam .......[NO] [OKAY]..................................... -[OKAY][NO][NO] - .............. [OKAY][OKAY] - -[NO] sparse_attn............transformer ...................[NO]............ [NO][OKAY] ....... [NO] - ....... [OKAY] .......[OKAY] -transformer -fused_adam .............fused_adam [NO]............. fused_adam fused_adam....... [NO] [OKAY]................................. --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - [OKAY]............ 
- [NO][NO][OKAY] - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at -transformer transformer [NO] ........................ stochastic_transformer .......[NO] [NO] .[OKAY]....... -....... [OKAY][NO] -fused_lamb....... ....... ............. [OKAY]fused_lamb [OKAY] - - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -[OKAY] stochastic_transformer -....... stochastic_transformer [OKAY]. -[NO]............. .......[NO]fused_lamb fused_lamb [OKAY].................... -............. [NO][OKAY][NO] --------------------------------------------------- -JIT compiled ops requires ninja -stochastic_transformer. [NO][NO] ........ [NO] .......[OKAY] -.......[OKAY] -[OKAY] - .............. [OKAY][OKAY] - -sparse_attn ............ [NO]sparse_attn ................... [OKAY][NO] - sparse_attnsparse_attn....... transformer............[OKAY] ............ - ............ [NO]transformer [NO].......[NO]............ .......[OKAY].......[NO] - [OKAY] .......[OKAY] - -transformer[OKAY] -transformer............stochastic_transformer stochastic_transformer............[NO] . .[NO] ....... [NO] [NO]....... [OKAY].............. - [OKAY][OKAY][OKAY] - -stochastic_transformer - stochastic_transformer . [NO]. .......[NO] [OKAY]....... - [OKAY] -ninjaninjaninjaninja .................................... ....................................[OKAY][OKAY] - -[OKAY][OKAY] --------------------------------------------------- --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -op name - op nameop name................op name ................ ................ installed ................ installedinstalled .. .. installed.. compatiblecompatible -.. -compatible ---------------------------------------------------------------------------------------------------- -compatible - - --------------------------------------------------- --------------------------------------------------- -cpu_adam cpu_adam...............cpu_adam cpu_adam [NO].............................. ......................[NO][NO] ....... [NO][OKAY]....... - .......[OKAY][OKAY] - -[OKAY] -fused_adam ............. fused_adam[NO]fused_adamfused_adam .............................................. [NO] [NO][OKAY] [NO] -....... ....... ....... [OKAY] [OKAY] -[OKAY] -fused_lamb - fused_lamb.............fused_lamb fused_lamb.............[NO] .......................... [NO].......[NO] [NO] ....... ....... [OKAY][OKAY]....... 
- - [OKAY][OKAY] - -sparse_attn ............ sparse_attnsparse_attn [NO]............ sparse_attn ............[NO]....... ............[NO][OKAY] ....... -[NO]....... [OKAY]transformer[OKAY]....... - - ............[OKAY]transformer -transformer [NO] transformer............ ............ ....... ............[NO] [NO][NO] [OKAY] .............. - ....... [OKAY] [OKAY] -[OKAY] -stochastic_transformer - stochastic_transformer .stochastic_transformerstochastic_transformer. [NO].[NO] . ....... [NO].......[NO][OKAY] [OKAY] -....... -....... [OKAY][OKAY] - -ninjaninjaninja ninja .................................... .................. .................. [OKAY] [OKAY][OKAY] -[OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op name -op nameop name op name ................................ ................ ................ installedinstalled installedinstalled.... .... compatiblecompatible -compatiblecompatible --------------------------------------------------- --------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam cpu_adam...............cpu_adam ............... [NO]............... ............... [NO] [NO]....... [NO]....... ....... .......[OKAY] -[OKAY][OKAY][OKAY] - - -fused_adam fused_adam.............fused_adam fused_adam............. [NO] ............. ............. ....... [NO] [NO] [OKAY][NO] .............. -....... [OKAY][OKAY][OKAY] - - -fused_lamb ............. fused_lambfused_lambfused_lamb[NO] ....... .......................... ............. [OKAY] [NO][NO] - [NO] ....... ....... ....... [OKAY] [OKAY] -[OKAY] - -sparse_attn ............ [NO] sparse_attnsparse_attnsparse_attn....... ........................[OKAY] ............ -[NO] [NO][NO]....... .......transformer[OKAY]....... - [OKAY]............ -[OKAY] transformer -[NO] ............transformer.......transformer [NO][OKAY]............ ............ - ....... [NO] [NO] [OKAY] ....... -....... stochastic_transformer [OKAY] [OKAY] -. - stochastic_transformerstochastic_transformer[NO] stochastic_transformer........ . .[OKAY][NO] - [NO][NO]....... ..............[OKAY] -[OKAY][OKAY] - -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY] -[OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- - -----------------------------------------------------------------------------------------------------op nameop name - - ................op name................op name installed ................installed .................. installed..compatibleinstalled -compatible .. ---------------------------------------------------.. --------------------------------------------------- compatible - -compatible --------------------------------------------------- --------------------------------------------------- -cpu_adam cpu_adam............... ...............[NO]cpu_adam cpu_adam......................[NO] ...............[OKAY].......[NO] - [OKAY].......[NO] - [OKAY]....... - [OKAY] -fused_adam ............. [NO]fused_adam .................... fused_adam [OKAY] fused_adam[NO] - ............. ....... .............fused_lamb [NO][OKAY] [NO] -.................... fused_lamb[NO]....... [OKAY] .............[OKAY]....... - - [NO][OKAY]fused_lamb -fused_lamb....... 
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
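Each "[NO] ....... [OKAY]" row above means the op is not pre-built but is JIT-installable: DeepSpeed compiles it with ninja the first time it is requested. A minimal sketch of reproducing that check by hand, assuming the DeepSpeed 0.5.x layout where op builders live in deepspeed.ops.op_builder (cpu_adam used as the example op):

from deepspeed.ops.op_builder import CPUAdamBuilder

builder = CPUAdamBuilder()
# is_compatible() mirrors the "compatible" column: the system can satisfy the
# dependencies needed to JIT-install the op even though it is not yet built.
print("cpu_adam compatible:", builder.is_compatible())
# load() is the step that "requires ninja": it JIT-compiles the extension on
# first use and returns the built module.
cpu_adam = builder.load()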
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
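These warnings are emitted once per rank and are benign unless the async_io op is actually needed; they only record that the probe for libaio failed on these nodes. A rough stand-in for that probe, using only the standard library rather than DeepSpeed's own check:

import ctypes.util

# async_io links against libaio; the warning above fires when neither the
# shared object nor its headers are visible to the build.
if ctypes.util.find_library("aio") is None:
    print("libaio missing: install libaio-devel (yum) / libaio-dev (apt),")
    print("or point CFLAGS/LDFLAGS at a from-source install.")
else:
    print("libaio found; async_io should be JIT-installable.")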
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
[OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
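The async_io op reports [NO] because DeepSpeed cannot find the libaio development files it would need to JIT-compile that op. A minimal sketch (not part of the log; uses only the standard-library ctypes module) to check whether the libaio runtime object is visible on a node:

    import ctypes.util

    # find_library("aio") returns something like "libaio.so.1" when the
    # runtime library is installed, or None when it is missing -- the
    # situation the [WARNING] lines above report. Note the JIT build also
    # needs the development headers (libaio-devel), which this cannot see.
    found = ctypes.util.find_library("aio")
    print("libaio runtime object:", found or "not found")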
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
torch 1.8, cuda 11.1 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.------------------------------------------------------------------------------------------------------------------------------------------------------ - - --------------------------------------------------- -JIT compiled ops requires ninja -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -JIT compiled ops requires ninja - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -torch install pathtorch install path ............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch version torch version.................... ....................1.8.1 -1.8.1 -torch cuda version torch cuda version............... ...............11.1 -11.1 -nvcc version nvcc version..................... .....................11.2 -11.2deepspeed install path - deepspeed install path........... ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed info - deepspeed info................... 
...................0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -0.5.5+57dee5a, 57dee5a, pp_deadlock_fixdeepspeed wheel compiled w. - deepspeed wheel compiled w....... ......torch 1.8, cuda 11.1 -torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY][OKAY][OKAY] - - - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- ---------------------------------------------------op name - -op name op name................ ................op name................ ................installed installedinstalled installed.... .. .. compatiblecompatible compatible - -compatible-------------------------------------------------- --------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- - -cpu_adamcpu_adam cpu_adam cpu_adam............... .............................. ...............[NO] [NO][NO]....... [NO] .......[OKAY]....... -....... [OKAY][OKAY][OKAY] - - -fused_adam ............. [NO] .......fused_adam fused_adamfused_adam [OKAY]............. ............. - .............[NO][NO] fused_lamb [NO]........................... .......[OKAY][NO][OKAY] - -.......[OKAY] fused_lamb -[OKAY] fused_lamb -............. .............fused_lamb[NO] [NO] .................... ....... [NO] [OKAY] -[OKAY].......sparse_attn - [OKAY]............ -[NO] ....... [OKAY] -sparse_attntransformer sparse_attn........................sparse_attn [NO] ............ [NO]............ .............. [NO] [OKAY][NO] - [OKAY] stochastic_transformer....... ....... - .[OKAY][OKAY] -transformer -[NO] transformer.......transformer............ ............[NO][OKAY]............ - ....... [NO] [NO] [OKAY] ....... -....... [OKAY][OKAY] -stochastic_transformer - . stochastic_transformerstochastic_transformer[NO] .. ....... [NO][OKAY][NO] - .............. [OKAY][OKAY] - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
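The three [WARNING] lines above are actionable: the async_io op needs the libaio development files. A minimal sketch of the suggested CFLAGS/LDFLAGS fix, assuming libaio was built from source under a hypothetical prefix (paths illustrative) and that this DeepSpeed version honors the DS_BUILD_AIO prebuild flag:

    # Hypothetical prefix of a from-source libaio install.
    LIBAIO_PREFIX=$HOME/opt/libaio
    # Point the compiler and linker at the libaio headers and .so, as the
    # warning suggests, then rebuild DeepSpeed with the async_io op enabled.
    # Run from the DeepSpeed source checkout.
    CFLAGS="-I$LIBAIO_PREFIX/include" \
    LDFLAGS="-L$LIBAIO_PREFIX/lib" \
    DS_BUILD_AIO=1 pip install -e .

On a yum-based system, installing libaio-devel (as the warning says) makes the environment variables unnecessary.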
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+57dee5a, 57dee5a, pp_deadlock_fix
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-**** Git info for Megatron: git_hash=829cefd git_branch=main ****
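Both the op report and the environment block above are printed by DeepSpeed's ds_report utility; instead of reading them interleaved from 128 ranks, the same summary can be regenerated once on a single node (assuming the same conda environment is activated):

    # Print the op compatibility report and environment info from one process.
    ds_report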
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1645939.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 1000
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 55
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.02
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 6e-05
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 216320
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
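The "constant 2048" micro-batch count logged above follows from the argument dump: with rampup_batch_size unset, the number of micro-batches per step is global_batch_size / (micro_batch_size * data_parallel_size). A quick shell check with the logged values:

    # 2048 global batch / (1 micro-batch * 1 data-parallel replica)
    echo $(( 2048 / (1 * 1) ))   # -> 2048 micro-batches per step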
-> padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
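The 431 dummy tokens above are plain arithmetic: Megatron rounds the vocab up to a multiple of make_vocab_size_divisible_by (128) times the tensor-model-parallel size (4), i.e. 512, so each tensor-parallel shard of the embedding gets an equal slice. Reproducing the 50257 -> 50688 line:

    VOCAB=50257
    DIVISOR=$(( 128 * 4 ))   # make_vocab_size_divisible_by * tensor_model_parallel_size
    PADDED=$(( (VOCAB + DIVISOR - 1) / DIVISOR * DIVISOR ))
    echo "$PADDED padded, $(( PADDED - VOCAB )) dummy"   # -> 50688 padded, 431 dummy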
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-22 06:34:46,025] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-g++ -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color -I/gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -I/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/pybind11/include helpers.cpp -o helpers.cpython-38-x86_64-linux-gnu.so
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 6.052 seconds
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
- warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
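The UserWarning above is torch.utils.cpp_extension's compiler check: the JIT builds invoke the default c++ while the torch wheel was built with g++. One possible workaround, assuming this torch version reads the CXX environment variable for extension builds (and MAX_JOBS for the ninja worker count, as the message notes), is to pin both before launching:

    # Build the fused kernels with g++, the compiler the torch wheel was
    # built with, and cap ninja's parallelism.
    export CXX=g++
    export MAX_JOBS=4   # illustrative value; ninja otherwise picks its own default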
(overridable by setting the environment variable MAX_JOBS=N) -[1/3] c++ -MMD -MF scaled_upper_triang_masked_softmax.o.d -DTORCH_EXTENSION_NAME=scaled_upper_triang_masked_softmax_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -o scaled_upper_triang_masked_softmax.o -[2/3] /gpfslocalsys/cuda/11.2/bin/nvcc --generate-dependencies-with-compile --dependency-output scaled_upper_triang_masked_softmax_cuda.cuda.o.d -DTORCH_EXTENSION_NAME=scaled_upper_triang_masked_softmax_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 -gencode arch=compute_70,code=sm_70 --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda -gencode arch=compute_80,code=sm_80 -std=c++14 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -o scaled_upper_triang_masked_softmax_cuda.cuda.o -[3/3] c++ scaled_upper_triang_masked_softmax.o scaled_upper_triang_masked_softmax_cuda.cuda.o -shared -L/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/gpfslocalsys/cuda/11.2/lib64 -lcudart -o scaled_upper_triang_masked_softmax_cuda.so -Loading extension module scaled_upper_triang_masked_softmax_cuda... -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. 
Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -Detected CUDA files, patching ldflags -Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... -Building extension module scaled_masked_softmax_cuda... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -[1/3] c++ -MMD -MF scaled_masked_softmax.o.d -DTORCH_EXTENSION_NAME=scaled_masked_softmax_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -o scaled_masked_softmax.o -[2/3] /gpfslocalsys/cuda/11.2/bin/nvcc --generate-dependencies-with-compile --dependency-output scaled_masked_softmax_cuda.cuda.o.d -DTORCH_EXTENSION_NAME=scaled_masked_softmax_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 -gencode arch=compute_70,code=sm_70 --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda -gencode arch=compute_80,code=sm_80 -std=c++14 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -o scaled_masked_softmax_cuda.cuda.o -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h(320): warning: variable "batch_count" was declared but never referenced - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h(320): warning: variable 
"batch_count" was declared but never referenced - -[3/3] c++ scaled_masked_softmax.o scaled_masked_softmax_cuda.cuda.o -shared -L/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/gpfslocalsys/cuda/11.2/lib64 -lcudart -o scaled_masked_softmax_cuda.so -Loading extension module scaled_masked_softmax_cuda... -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -Detected CUDA files, patching ldflags -Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... -Building extension module fused_mix_prec_layer_norm_cuda... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -[1/3] c++ -MMD -MF layer_norm_cuda.o.d -DTORCH_EXTENSION_NAME=fused_mix_prec_layer_norm_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -c /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -o layer_norm_cuda.o -[2/3] /gpfslocalsys/cuda/11.2/bin/nvcc --generate-dependencies-with-compile --dependency-output layer_norm_cuda_kernel.cuda.o.d -DTORCH_EXTENSION_NAME=fused_mix_prec_layer_norm_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/TH -isystem /gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/include/THC -isystem /gpfslocalsys/cuda/11.2/include -isystem /gpfswork/rech/six/commun/conda/cutting-edge/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr 
-[3/3] c++ layer_norm_cuda.o layer_norm_cuda_kernel.cuda.o -shared -L/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/gpfslocalsys/cuda/11.2/lib64 -lcudart -o fused_mix_prec_layer_norm_cuda.so
-Loading extension module fused_mix_prec_layer_norm_cuda...
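Side note: the ninja [1/3]..[3/3] steps above come from PyTorch's torch.utils.cpp_extension JIT builder, which Megatron-DeepSpeed uses to compile its fused kernels at startup. Below is a minimal sketch of that mechanism, assuming a CUDA-enabled PyTorch install; the source paths and flags are illustrative rather than the exact megatron/fused_kernels invocation, and MAX_JOBS is the worker-count override mentioned in the "Allowing ninja to set a default number of workers" lines.

    import os
    os.environ.setdefault("MAX_JOBS", "4")  # cap ninja's parallel compile jobs

    from torch.utils import cpp_extension

    # load() writes a build.ninja file, runs ninja (the [1/3]..[3/3] steps),
    # then dlopens the resulting .so and returns it as a Python module.
    scaled_masked_softmax_cuda = cpp_extension.load(
        name="scaled_masked_softmax_cuda",
        sources=[
            "megatron/fused_kernels/scaled_masked_softmax.cpp",      # C++ bindings
            "megatron/fused_kernels/scaled_masked_softmax_cuda.cu",  # CUDA kernels
        ],
        extra_cflags=["-O3"],
        extra_cuda_cflags=[
            "-O3", "--use_fast_math",
            "-gencode", "arch=compute_70,code=sm_70",  # V100, as in the log
            "-gencode", "arch=compute_80,code=sm_80",  # A100
        ],
        verbose=True,  # echo the per-step compiler command lines, as seen above
    )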
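Side note: the UserWarning above fires because PyTorch on this platform was built with g++ while the JIT build invokes the generic c++ driver; every rank emits an identical copy for each extension it loads. A hedged remedy, assuming c++ on this cluster is not already a symlink to a matching g++, is to pick the compiler explicitly before the kernels are built:

    import os
    # torch.utils.cpp_extension reads its host compiler from the CXX
    # environment variable (falling back to "c++"), so exporting it before
    # the JIT build should silence the compatibility warning. Assumption:
    # this g++ matches the toolchain PyTorch itself was compiled with.
    os.environ["CXX"] = "g++"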
->>> done with compiling and loading fused kernels.
Compilation time: 162.654 seconds -time to initialize megatron (seconds): 142.840 -[after megatron is initialized] datetime: 2021-10-22 06:37:34 -building GPT model ... -[2021-10-22 06:37:34,906] [INFO] [utils.py:806:see_memory_usage] Before Building Model -[2021-10-22 06:37:34,907] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB -[2021-10-22 06:37:34,908] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.18 GB, percent = 20.9% -SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None -Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, 
ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127} -[2021-10-22 06:37:36,589] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer -stage=0 layers=5 - 0: _to_float16 - 1: EmbeddingPipe - 2: - 3: ParallelTransformerLayerPipe - 4: ParallelTransformerLayerPipe -stage=1 layers=2 - 5: ParallelTransformerLayerPipe - 6: ParallelTransformerLayerPipe -stage=2 layers=2 - 7: ParallelTransformerLayerPipe - 8: ParallelTransformerLayerPipe -stage=3 layers=2 - 9: ParallelTransformerLayerPipe - 10: ParallelTransformerLayerPipe -stage=4 layers=2 - 11: ParallelTransformerLayerPipe - 12: ParallelTransformerLayerPipe -stage=5 layers=2 - 13: ParallelTransformerLayerPipe - 14: ParallelTransformerLayerPipe -stage=6 layers=2 - 15: ParallelTransformerLayerPipe - 16: ParallelTransformerLayerPipe -stage=7 layers=2 - 17: ParallelTransformerLayerPipe - 18: ParallelTransformerLayerPipe -stage=8 layers=2 - 19: ParallelTransformerLayerPipe - 20: 
ParallelTransformerLayerPipe -stage=9 layers=2 - 21: ParallelTransformerLayerPipe - 22: ParallelTransformerLayerPipe -stage=10 layers=2 - 23: ParallelTransformerLayerPipe - 24: ParallelTransformerLayerPipe -stage=11 layers=2 - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=12 layers=2 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe -stage=13 layers=2 - 29: ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=14 layers=2 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe -stage=15 layers=2 - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe -stage=16 layers=2 - 35: ParallelTransformerLayerPipe - 36: ParallelTransformerLayerPipe -stage=17 layers=2 - 37: ParallelTransformerLayerPipe - 38: ParallelTransformerLayerPipe -stage=18 layers=2 - 39: ParallelTransformerLayerPipe - 40: ParallelTransformerLayerPipe -stage=19 layers=2 - 41: ParallelTransformerLayerPipe - 42: ParallelTransformerLayerPipe -stage=20 layers=2 - 43: ParallelTransformerLayerPipe - 44: ParallelTransformerLayerPipe -stage=21 layers=2 - 45: ParallelTransformerLayerPipe - 46: ParallelTransformerLayerPipe -stage=22 layers=2 - 47: ParallelTransformerLayerPipe - 48: ParallelTransformerLayerPipe -stage=23 layers=2 - 49: ParallelTransformerLayerPipe - 50: ParallelTransformerLayerPipe -stage=24 layers=2 - 51: ParallelTransformerLayerPipe - 52: ParallelTransformerLayerPipe -stage=25 layers=2 - 53: ParallelTransformerLayerPipe - 54: ParallelTransformerLayerPipe -stage=26 layers=2 - 55: ParallelTransformerLayerPipe - 56: ParallelTransformerLayerPipe -stage=27 layers=2 - 57: ParallelTransformerLayerPipe - 58: ParallelTransformerLayerPipe -stage=28 layers=2 - 59: ParallelTransformerLayerPipe - 60: ParallelTransformerLayerPipe -stage=29 layers=2 - 61: ParallelTransformerLayerPipe - 62: ParallelTransformerLayerPipe -stage=30 layers=2 - 63: ParallelTransformerLayerPipe - 64: ParallelTransformerLayerPipe -stage=31 layers=6 - 65: ParallelTransformerLayerPipe - 66: ParallelTransformerLayerPipe - 67: - 68: MixedFusedLayerNorm - 69: EmbeddingPipe - 70: float16_to_fp32 - loss: CrossEntropy -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 237, in - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/training.py", line 134, in pretrain - model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/training.py", line 342, in setup_model_and_optimizer - model = get_model(model_provider_func) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/training.py", line 240, in get_model - model = model_provider_func( - File "/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py", line 68, in model_provider - model = GPTModelPipe( - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 279, in __init__ - super().__init__(layers=self.specs, - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed/runtime/pipe/module.py", line 200, in __init__ - self._build() - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed/runtime/pipe/module.py", line 248, in _build - module = 
layer.build() - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed/runtime/pipe/module.py", line 70, in build - return self.typename(*self.module_args, **self.module_kwargs) - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/model/transformer.py", line 441, in __init__ - self.self_attention = ParallelAttention( - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/model/transformer.py", line 151, in __init__ - self.query_key_value = mpu.ColumnParallelLinear( - File "/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/mpu/layers.py", line 259, in __init__ - self.weight = Parameter(torch.empty( -RuntimeError: CUDA out of memory. Tried to allocate 194.00 MiB (GPU 2; 31.75 GiB total capacity; 12.00 MiB already allocated; 157.00 MiB free; 18.00 MiB reserved in total by PyTorch) -Killing subprocess 2924509 -Killing subprocess 2924510 -Killing subprocess 2924511 -Killing subprocess 2924512 -Traceback (most recent call last): - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 194, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/runpy.py", line 87, in _run_code - exec(code, run_globals) - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 340, in - main() - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 326, in main - sigkill_handler(signal.SIGTERM, None) # not coming back - File "/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler - raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd) -subprocess.CalledProcessError: Command '['/gpfswork/rech/six/commun/conda/cutting-edge/bin/python', '-u', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/pretrain_gpt.py', '--local_rank=3', '--tensor-model-parallel-size', '4', '--pipeline-model-parallel-size', '32', '--num-layers', '64', '--hidden-size', '11600', '--num-attention-heads', '80', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '1', '--global-batch-size', '2048', '--train-samples', '600_000_000', '--train-tokens', '300_000_000_000', '--vocab-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json', '--merge-file', '/gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt', '--loss-scale', '12', '--fp16', '--checkpoint-activations', '--seed', '43', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-8', '--lr', '6e-5', '--min-lr', '6e-6', '--lr-warmup-samples', '216_320', '--lr-decay-tokens', '260000000000', '--lr-decay-style', 'cosine', '--clip-grad', '1.0', '--weight-decay', '1e-1', '--exit-duration-in-mins', '55', '--log-interval', '1', '--save-interval', '300', '--eval-interval', '1000', '--eval-iters', '5', '--tensorboard-dir', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard', '--tensorboard-queue-size', '5', '--log-timers-to-tensorboard', '--log-batch-size-to-tensorboard', '--log-validation-ppl-to-tensorboard', '--save', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', '--load', '/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints', 
'--data-path', '/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document', '--data-impl', 'mmap', '--split', '949,50,1', '--distributed-backend', 'nccl', '--deepspeed', '--deepspeed_config', './ds_config.1645939.json', '--zero-stage', '1', '--deepspeed-activation-checkpointing']' returned non-zero exit status 1. -srun: error: r8i2n4: task 22: Exited with exit code 1 -srun: Terminating job step 1645939.0 -slurmstepd: error: *** STEP 1645939.0 ON r6i3n3 CANCELLED AT 2021-10-22T06:37:39 *** -Killing subprocess 2330903 -Killing subprocess 2330904 -Killing subprocess 2330905 -Killing subprocess 2330906 -Killing subprocess 2151458 -Main process received SIGTERM, exiting -Killing subprocess 1978795 -Killing subprocess 2151459 -Killing subprocess 2151460 -Killing subprocess 2151462 -Main process received SIGTERM, exiting -Killing subprocess 1978796 -Killing subprocess 195931 -Killing subprocess 1978797 -Killing subprocess 1978798 -Main process received SIGTERM, exiting -Killing subprocess 2190049 -Killing subprocess 195932 -Killing subprocess 2190050 -Killing subprocess 2190051 -Killing subprocess 2190053 -Killing subprocess 195933 -Killing subprocess 195934 -Main process received SIGTERM, exiting -Main process received SIGTERM, exiting -Killing subprocess 518488 -Killing subprocess 518489 -Killing subprocess 518490 -Killing subprocess 518491 -Main process received SIGTERM, exiting -Killing subprocess 2778649 -Killing subprocess 604324 -Killing subprocess 3025916 -Killing subprocess 604325 -Killing subprocess 2778650 -Killing subprocess 3080690 -Killing subprocess 767148 -Killing subprocess 3025917 -Killing subprocess 2778651 -Killing subprocess 2789903 -Killing subprocess 3080691 -Killing subprocess 1230242 -Killing subprocess 3025918 -Killing subprocess 604326 -Killing subprocess 767149 -Killing subprocess 2174109 -Killing subprocess 2789904 -Killing subprocess 610053 -Killing subprocess 2778652 -Killing subprocess 1230243 -Killing subprocess 2174110 -Killing subprocess 767150 -Killing subprocess 2789905 -Killing subprocess 610054 -Killing subprocess 604327 -Killing subprocess 2171255 -Killing subprocess 1230244 -Killing subprocess 767151 -Main process received SIGTERM, exiting -Killing subprocess 3080692 -Killing subprocess 610055 -Killing subprocess 3080693 -Killing subprocess 1000808 -Killing subprocess 2171256 -Main process received SIGTERM, exiting -Killing subprocess 1230245 -Killing subprocess 3025919 -Main process received SIGTERM, exiting -Killing subprocess 610056 -Killing subprocess 1000809 -Killing subprocess 2174111 -Main process received SIGTERM, exiting -Killing subprocess 2174112 -Main process received SIGTERM, exiting -Main process received SIGTERM, exiting -Main process received SIGTERM, exiting -Killing subprocess 2789907 -Killing subprocess 2171257 -Main process received SIGTERM, exiting -Killing subprocess 2171258 -Killing subprocess 1000810 -Main process received SIGTERM, exiting -Main process received SIGTERM, exiting -Killing subprocess 1000811 -Main process received SIGTERM, exiting -Killing subprocess 3527695 -Killing subprocess 3527696 -Killing subprocess 3527697 -Killing subprocess 3527698 -Main process received SIGTERM, exiting -Killing subprocess 1931279 -Killing subprocess 1931280 -Killing subprocess 1931281 -Killing subprocess 1931282 -Main process received SIGTERM, exiting -Killing subprocess 865870 -Killing subprocess 865871 -Killing subprocess 865872 -Killing subprocess 865873 -Main process received SIGTERM, exiting -Killing subprocess 966479 
-Killing subprocess 966480 -Killing subprocess 966481 -Killing subprocess 966482 -Main process received SIGTERM, exiting -Killing subprocess 2004240 -Killing subprocess 2399152 -Killing subprocess 2004241 -Killing subprocess 2399153 -Killing subprocess 2004242 -Killing subprocess 2004243 -Main process received SIGTERM, exiting -Killing subprocess 2399154 -Killing subprocess 2278198 -Killing subprocess 2278199 -Killing subprocess 2399155 -Main process received SIGTERM, exiting -Killing subprocess 2278200 -Killing subprocess 2278202 -Main process received SIGTERM, exiting -Killing subprocess 1900979 -Killing subprocess 1900980 -Killing subprocess 1900981 -Killing subprocess 1900982 -Main process received SIGTERM, exiting -Killing subprocess 3130893 -Killing subprocess 3130894 -Killing subprocess 3130895 -Killing subprocess 3130896 -Main process received SIGTERM, exiting -Killing subprocess 2132490 -Killing subprocess 2132491 -Killing subprocess 2132492 -Killing subprocess 2132494 -Main process received SIGTERM, exiting -Killing subprocess 2027567 -Killing subprocess 2027568 -Killing subprocess 1923123 -Killing subprocess 1923124 -Killing subprocess 1923125 -Killing subprocess 2027569 -Killing subprocess 2027570 -Main process received SIGTERM, exiting -Killing subprocess 1923126 -Main process received SIGTERM, exiting -Killing subprocess 2024004 -Killing subprocess 2024005 -Killing subprocess 2024006 -Killing subprocess 2024007 -Main process received SIGTERM, exiting -Killing subprocess 1934525 -Killing subprocess 1934526 -Killing subprocess 1934527 -Killing subprocess 1934528 -Main process received SIGTERM, exiting -srun: error: r6i4n1: task 1: Exited with exit code 1 -srun: error: r8i7n8: task 27: Exited with exit code 1 -srun: error: r6i4n4: task 4: Exited with exit code 1 -srun: error: r9i0n0: task 28: Exited with exit code 1 -srun: error: r9i0n1: task 29: Exited with exit code 1 -srun: error: r6i6n1: task 5: Exited with exit code 1 -srun: error: r6i4n2: task 2: Exited with exit code 1 -srun: error: r6i4n3: task 3: Exited with exit code 1 -srun: error: r8i2n5: task 23: Exited with exit code 1 -srun: error: r8i2n7: task 25: Exited with exit code 1 -srun: error: r8i2n2: task 20: Exited with exit code 1 -srun: error: r8i2n3: task 21: Exited with exit code 1 -srun: error: r8i2n8: task 26: Exited with exit code 1 -srun: error: r8i2n0: task 18: Exited with exit code 1 -srun: error: r8i1n2: task 11: Exited with exit code 1 -srun: error: r7i1n4: task 6: Exited with exit code 1 -srun: error: r8i2n1: task 19: Exited with exit code 1 -srun: error: r9i0n3: task 30: Exited with exit code 1 -srun: error: r8i2n6: task 24: Exited with exit code 1 -srun: error: r8i1n4: task 13: Exited with exit code 1 -srun: error: r7i1n6: task 7: Exited with exit code 1 -srun: error: r8i1n3: task 12: Exited with exit code 1 -srun: error: r6i3n3: task 0: Exited with exit code 1 -srun: error: r7i4n3: task 8: Exited with exit code 1 -srun: error: r8i1n7: task 16: Exited with exit code 1 -srun: error: r8i1n6: task 15: Exited with exit code 1 -srun: error: r8i1n8: task 17: Exited with exit code 1 -srun: error: r7i6n5: task 9: Exited with exit code 1 -srun: error: r8i0n7: task 10: Exited with exit code 1 -srun: error: r9i6n0: task 31: Exited with exit code 1 -srun: error: r8i1n5: task 14: Exited with exit code 1 -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal 
performance in your application as needed.
-*****************************************
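Side note on the failed attempt above: the parallel layout in the pretrain_gpt.py command line, the rank topology, and the OOM message are mutually consistent. A back-of-envelope check, under the assumption (not stated in the log) that the failing torch.empty call in mpu/layers.py is allocating the fp16 shard of the fused query_key_value weight:

    # Parallel layout from the launch command above.
    tp, pp, dp = 4, 32, 1        # tensor-, pipeline-, data-parallel sizes
    print(tp * pp * dp)          # -> 128 ranks, matching ProcessCoord 0..127
    print(64 // pp)              # -> 2 transformer layers per stage, matching the stage map

    # The allocation that failed: ColumnParallelLinear splits the fused
    # (3*hidden, hidden) QKV weight across tp ranks along the output dimension.
    hidden = 11600               # --hidden-size
    shard_params = 3 * hidden * hidden // tp
    print(shard_params)              # -> 100920000 parameters per rank
    print(shard_params * 2 / 2**20)  # -> ~192.5 MiB in fp16, close to the
                                     #    194.00 MiB request after allocator rounding

With only 157.00 MiB reported free on GPU 2, a single request of that size is enough to fail and take down the whole 32-node job step.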
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
-[... every rank prints this same op report; the interleaved duplicate copies are omitted ...]
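Each rank emits the report above at DeepSpeed import time; the same table can be reproduced on demand with DeepSpeed's ds_report command-line utility, or queried programmatically. The sketch below assumes a DeepSpeed version that exposes these op builders; it is an approximation, not the code behind this log:

    from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder

    # is_compatible() corresponds to the "compatible" column: it checks
    # whether the system could JIT-compile the op if it were needed.
    for builder in (CPUAdamBuilder(), FusedAdamBuilder()):
        print(builder.NAME, "compatible:", builder.is_compatible())

[NO] in the "installed" column only means the op was not pre-built when DeepSpeed was installed; it will be JIT-compiled on first use, which is why the report insists on ninja being available.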
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
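[The async_io warnings above recur on every rank that lacks the libaio development files. A minimal sketch of the remedy the warning itself suggests, assuming a from-source libaio under the hypothetical prefix /usr/local; on a yum-based system the packaged route is installing libaio-devel. The variables must be set in the environment of whichever process (re)builds the async_io op:]

    import os

    # Hypothetical prefix of a from-source libaio build; adjust to the real one.
    LIBAIO_PREFIX = "/usr/local"

    # Lets DeepSpeed's JIT build of async_io find libaio.h and libaio.so,
    # as the warning suggests.
    os.environ["CFLAGS"] = f"-I{LIBAIO_PREFIX}/include"
    os.environ["LDFLAGS"] = f"-L{LIBAIO_PREFIX}/lib"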
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
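[Each rank also dumps the same environment summary. A minimal sketch reproducing those fields from stock attributes — deepspeed.__git_hash__ and deepspeed.__git_branch__ are assumed to exist as in the 0.5.x release line; the nvcc version comes from the nvcc binary and is omitted here:]

    import torch
    import deepspeed

    print("torch install path ...", torch.__path__)      # a list, as in the log
    print("torch version ........", torch.__version__)   # 1.8.1 on this run
    print("torch cuda version ...", torch.version.cuda)  # 11.1 on this run
    print("deepspeed info .......", deepspeed.__version__,
          deepspeed.__git_hash__, deepspeed.__git_branch__)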
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
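- [NOTE]  "installed .. [NO]" plus "compatible .. [OKAY]" means each op is built with ninja on first use rather than at install time. A hedged sketch of that JIT path, assuming the deepspeed.ops.op_builder API of this DeepSpeed checkout (0.5.5):
-
-    from deepspeed.ops.op_builder import CPUAdamBuilder
-
-    builder = CPUAdamBuilder()
-    print(builder.is_compatible())   # the "compatible" column of the table
-    cpu_adam = builder.load()        # triggers the ninja JIT build if needed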
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1655850.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 1000
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 55
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.02
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 6e-05
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 216320
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
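- [NOTE]  Several of the printed arguments can be cross-checked against one another. A worked sketch in plain Python (values copied from the dump above):
-
-    # parallel topology: world_size = TP x PP x DP
-    world_size, tensor_mp, pipeline_mp = 128, 4, 32
-    data_parallel = world_size // (tensor_mp * pipeline_mp)
-    assert data_parallel == 1        # matches "data-parallel-size: 1"
-
-    # micro-batches per step: global_batch_size / (micro_batch_size * DP)
-    global_batch, micro_batch = 2048, 1
-    assert global_batch // (micro_batch * data_parallel) == 2048
-    # hence "setting number of micro-batches to constant 2048"
-
-    # rough parameter count: ~12*L*h^2 for the blocks, plus embeddings
-    hidden, layers, seq, padded_vocab = 11600, 64, 2048, 50688
-    params = 12 * layers * hidden**2 + (padded_vocab + seq) * hidden
-    print(round(params / 1e9, 1))    # ~104.0, the "104B" in tr8b-104B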
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
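- [NOTE]  Worked check of the padding step reported above: the vocabulary is grown to the next multiple of make_vocab_size_divisible_by (128) times the tensor-parallel size (4):
-
-    import math
-
-    orig_vocab, divisible_by, tensor_mp = 50257, 128, 4
-    multiple = divisible_by * tensor_mp                   # 512
-    padded = math.ceil(orig_vocab / multiple) * multiple  # 50688
-    assert padded - orig_vocab == 431                     # the 431 dummy tokens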
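- [NOTE]  The arguments point at deepspeed_config ./ds_config.1655850.json, whose contents the log never prints. A hypothetical reconstruction consistent with the dumped arguments (zero_stage 1, fp16 with dynamic loss scaling starting at 2**32), using standard DeepSpeed config keys; this is not the real file:
-
-    import json
-
-    ds_config = {                                 # hypothetical reconstruction
-        "train_batch_size": 2048,                 # global_batch_size
-        "train_micro_batch_size_per_gpu": 1,      # micro_batch_size
-        "gradient_clipping": 1.0,                 # clip_grad
-        "zero_optimization": {"stage": 1},        # zero_stage
-        "fp16": {
-            "enabled": True,
-            "loss_scale": 0,                      # 0 selects dynamic scaling
-            "initial_scale_power": 32,            # 2**32 == initial_loss_scale
-            "loss_scale_window": 1000,
-            "hysteresis": 2,
-            "min_loss_scale": 1,
-        },
-    }
-    print(json.dumps(ds_config, indent=2))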
[OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. ------------------------------------------------------------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - - -DeepSpeed C++/CUDA extension op report----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- -JIT compiled ops requires ninja - -JIT compiled ops requires ninja -JIT compiled ops requires ninja - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. 
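This op report is what DeepSpeed's bundled ds_report utility prints, once per rank. As a sketch (assuming the deepspeed.ops.op_builder layout of this 0.5.x checkout), the same installed/compatible columns can be queried programmatically:

```python
# Sketch: query op compatibility the way the report's "compatible" column
# does. The builder names below assume DeepSpeed 0.5.x's op_builder module.
from deepspeed.ops.op_builder import (
    CPUAdamBuilder,
    FusedAdamBuilder,
    FusedLambBuilder,
)

for builder in (CPUAdamBuilder(), FusedAdamBuilder(), FusedLambBuilder()):
    # is_compatible() checks the system dependencies needed to JIT-build the
    # op; "installed" would instead mean it was pre-compiled into the wheel.
    print(f"{builder.NAME:<22} compatible={builder.is_compatible()}")
```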
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
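The environment block is likewise printed once per rank. For reference, a rough equivalent of the same summary, assuming only that torch and deepspeed are importable and nvcc is on PATH:

```python
# Rough equivalent of the "DeepSpeed general environment info" block above.
import subprocess

import torch
import deepspeed

print("torch install path :", list(torch.__path__))
print("torch version      :", torch.__version__)
print("torch cuda version :", torch.version.cuda)
print("deepspeed version  :", deepspeed.__version__)
# Last line of `nvcc --version` carries the release number, e.g. "V11.2.x".
nvcc = subprocess.run(["nvcc", "--version"], capture_output=True, text=True)
print("nvcc version       :", nvcc.stdout.strip().splitlines()[-1])
```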
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] .......
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - ............... [NO] ....... [NO] -async_iotransformer_inference ................. [NO][NO] .............. [OKAY][NO] - -utils .................. [NO] ....... [OKAY] -quantizer ..............transformer_inference [NO].. .......[NO] [OKAY]....... - [OKAY] --------------------------------------------------- -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO]async_io ....... ...............[NO] -[NO] ....... [NO] -transformer_inference ..transformer_inference [NO].. .......[NO] [OKAY]....... - [OKAY] -utils ..................utils [NO] ......................... [NO][OKAY] -....... [OKAY] -quantizer .............. [NO]quantizer ..................... [OKAY][NO] - ....... [OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -....... [NO] -async_io transformer_inference............... ..[NO] [NO]....... .......[NO] -[OKAY] -utils .................. [NO] ....... [OKAY]transformer_inference - .. [NO]quantizer ..................... [OKAY][NO] - ....... [OKAY] -utils ..................-------------------------------------------------- -[NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info:DeepSpeed general environment info:DeepSpeed general environment info: - - -torch install pathtorch install pathtorch install path ............................................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - - -torch versiontorch versiontorch version ............................................................ 1.8.11.8.11.8.1 - - -torch cuda versiontorch cuda versiontorch cuda version ............................................. 11.111.111.1 - - -nvcc versionnvcc versionnvcc version ............................................................... 11.211.211.2 - - -deepspeed install pathdeepspeed install pathdeepspeed install path ................................. ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - - -deepspeed infodeepspeed infodeepspeed info ......................................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - - -deepspeed wheel compiled w.deepspeed wheel compiled w.deepspeed wheel compiled w. .................. torch 1.8, cuda 11.1torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path ............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version ....................torch version 1.8.1.................... - 1.8.1 -torch cuda version ...............torch cuda version 11.1............... - 11.1nvcc version - .....................nvcc version 11.2..................... - deepspeed install path11.2 -...........deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed info - ...................deepspeed info 0.5.5+29bee73, 29bee73, master................... - 0.5.5+29bee73, 29bee73, masterdeepspeed wheel compiled w. - ......deepspeed wheel compiled w. torch 1.8, cuda 11.1...... - torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... 
[OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... DeepSpeed general environment info:['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch version ....................torch install path 1.8.1 - ............... torch cuda version ............... 11.1 -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']nvcc version - ..................... torch version11.2 -....................deepspeed install path 1.8.1........... - torch cuda version ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']............... - deepspeed info11.1 -...................nvcc version 0.5.5+29bee73, 29bee73, master..................... - 11.2deepspeed wheel compiled w. - deepspeed install path...... ...........torch 1.8, cuda 11.1 -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ******** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for 
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-22 18:28:35,800] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.303 seconds
-> compiling and loading fused kernels ...
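(Editor's note: the tensor/pipeline sizes logged a few lines above fix the layout of each model replica. A minimal sketch of the implied arithmetic; the total world size below is hypothetical, since it is not printed at this point in the log:

    # 3D-parallel layout implied by the log: TP=4, PP=32
    tp, pp = 4, 32
    gpus_per_replica = tp * pp            # 4 * 32 = 128 GPUs hold one model copy
    world_size = 512                      # hypothetical total GPU count
    dp = world_size // gpus_per_replica   # data-parallel degree, 4 in this example
    assert tp * pp * dp == world_size

)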
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
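(Editor's note: the UserWarning above comes from torch.utils.cpp_extension, which compares the CXX environment variable, defaulting to c++, against the compiler PyTorch was built with; since ninja reports "no work to do", the cached kernels are simply reloaded and the warning is cosmetic. A minimal sketch of silencing it, assuming g++ is available on PATH:

    import os
    # select g++ before the fused-kernel JIT build is triggered, so the
    # cpp_extension compiler check matches the g++ PyTorch was built with
    os.environ.setdefault("CXX", "g++")

)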
Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! 
- - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. 
Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! 
- - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. 
Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
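This warning is printed once per rank while the fused kernels are JIT-built; the build still succeeds because `c++` resolves to a compatible GCC on this system. A minimal sketch of the usual remedy, assuming this stack's `torch.utils.cpp_extension`, which (to my knowledge) resolves the host compiler from the CXX environment variable and falls back to plain `c++`:

import os

# Hedged sketch: point extension builds at g++ before the first JIT compile,
# which should make the compiler-mismatch warning above go away.
os.environ.setdefault("CXX", "g++")

Exporting CXX=g++ in the job script before launch accomplishes the same thing for every rank.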
->>> done with compiling and loading fused kernels. Compilation time: 25.888 seconds
-time to initialize megatron (seconds): 78.047
-[after megatron is initialized] datetime: 2021-10-22 18:29:02
-building GPT model ...
-[2021-10-22 18:29:02,120] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-22 18:29:02,121] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-22 18:29:02,121] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.03 GB, percent = 21.4%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
-[2021-10-22 18:29:03,802] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2:
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-    10: ParallelTransformerLayerPipe
-stage=4 layers=2
-    11: ParallelTransformerLayerPipe
-    12: ParallelTransformerLayerPipe
-stage=5 layers=2
-    13: ParallelTransformerLayerPipe
-    14: ParallelTransformerLayerPipe
-stage=6 layers=2
-    15: ParallelTransformerLayerPipe
-    16: ParallelTransformerLayerPipe
-stage=7 layers=2
-    17: ParallelTransformerLayerPipe
-    18: ParallelTransformerLayerPipe
-stage=8 layers=2
-    19: ParallelTransformerLayerPipe
-    20: ParallelTransformerLayerPipe
-stage=9 layers=2
-    21: ParallelTransformerLayerPipe
-    22: ParallelTransformerLayerPipe
-stage=10 layers=2
-    23: ParallelTransformerLayerPipe
-    24: ParallelTransformerLayerPipe
-stage=11 layers=2
-    25: ParallelTransformerLayerPipe
-    26: ParallelTransformerLayerPipe
-stage=12 layers=2
-    27: ParallelTransformerLayerPipe
-    28: ParallelTransformerLayerPipe
-stage=13 layers=2
-    29: ParallelTransformerLayerPipe
-    30: ParallelTransformerLayerPipe
-stage=14 layers=2
-    31: ParallelTransformerLayerPipe
-    32: ParallelTransformerLayerPipe
-stage=15 layers=2
-    33: ParallelTransformerLayerPipe
-    34: ParallelTransformerLayerPipe
-stage=16 layers=2
-    35: ParallelTransformerLayerPipe
-    36: ParallelTransformerLayerPipe
-stage=17 layers=2
-    37: ParallelTransformerLayerPipe
-    38: ParallelTransformerLayerPipe
-stage=18 layers=2
-    39: ParallelTransformerLayerPipe
-    40: ParallelTransformerLayerPipe
-stage=19 layers=2
-    41: ParallelTransformerLayerPipe
-    42: ParallelTransformerLayerPipe
-stage=20 layers=2
-    43: ParallelTransformerLayerPipe
-    44: ParallelTransformerLayerPipe
-stage=21 layers=2
-    45: ParallelTransformerLayerPipe
-    46: ParallelTransformerLayerPipe
-stage=22 layers=2
-    47: ParallelTransformerLayerPipe
-    48: ParallelTransformerLayerPipe
-stage=23 layers=2
-    49: ParallelTransformerLayerPipe
-    50: ParallelTransformerLayerPipe
-stage=24 layers=2
-    51: ParallelTransformerLayerPipe
-    52: ParallelTransformerLayerPipe
-stage=25 layers=2
-    53: ParallelTransformerLayerPipe
-    54: ParallelTransformerLayerPipe
-stage=26 layers=2
-    55: ParallelTransformerLayerPipe
-    56: ParallelTransformerLayerPipe
-stage=27 layers=2
-    57: ParallelTransformerLayerPipe
-    58: ParallelTransformerLayerPipe
-stage=28 layers=2
-    59: ParallelTransformerLayerPipe
-    60: ParallelTransformerLayerPipe
-stage=29 layers=2
-    61: ParallelTransformerLayerPipe
-    62: ParallelTransformerLayerPipe
-stage=30 layers=2
-    63: ParallelTransformerLayerPipe
-    64: ParallelTransformerLayerPipe
-stage=31 layers=6
-    65: ParallelTransformerLayerPipe
-    66: ParallelTransformerLayerPipe
-    67:
-    68: MixedFusedLayerNorm
-    69: EmbeddingPipe
-    70: float16_to_fp32
-  loss: CrossEntropy
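The topology map and the stage partition above are regular enough to reconstruct: with data_parallel_size=1, tensor_model_parallel_size=4 and pipeline_model_parallel_size=32 across 128 GPUs, the printed global rank is pipe * 4 + model, and the type:transformer partitioner spreads the 64 ParallelTransformerLayerPipe layers evenly, two per stage, with the fp16 cast and embedding on stage 0 and the tied embedding, final MixedFusedLayerNorm and fp32 cast on stage 31. A minimal sketch of that layout, reconstructed from the log rather than taken from the project's code:

# Assumption: ranks are laid out pipe-major, then tensor-parallel,
# as the ProcessCoord listing suggests.
TP, PP, N_LAYERS = 4, 32, 64

def coord_to_rank(pipe, model, data=0):
    return pipe * TP + model  # e.g. ProcessCoord(pipe=14, data=0, model=3) -> 59

layers = [N_LAYERS // PP] * PP  # two transformer layers per stage
layers[0] += 3                  # _to_float16, EmbeddingPipe, one anonymous layer
layers[-1] += 4                 # anonymous layer, MixedFusedLayerNorm, tied EmbeddingPipe, float16_to_fp32

assert coord_to_rank(14, 3) == 59 and coord_to_rank(31, 3) == 127
assert layers[0] == 5 and layers[1] == 2 and layers[-1] == 6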
- > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
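These per-rank totals are internally consistent with a roughly 104B-parameter transformer; hidden size 11,600 is an inference from the numbers, not something the log states. Each middle stage holds two layers whose 12*h^2 weight matrices are split four ways by tensor parallelism, while biases and layernorm weights stay whole on every rank; the first and last stages add an embedding shard, and the 23,200 = 2*h gap between 978315000 and 978291800 matches the extra MixedFusedLayerNorm on the last stage. A hedged consistency check:

# h = 11600 is inferred from the printed counts, not stated in the log.
h, tp, layers_per_stage = 11600, 4, 2

matrix = 12 * h * h // tp * layers_per_stage  # attention + MLP weight shards
vector = int(15.5 * h)                        # layernorms and biases kept whole per rank

assert matrix == 807_360_000                  # reappears below as the large ZeRO group
assert matrix + vector == 807_539_800         # the middle-stage count printed above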
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-[2021-10-22 18:29:04,502] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-22 18:29:04,503] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-22 18:29:04,503] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.2 GB, percent = 21.5%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-22 18:29:04,504] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-[2021-10-22 18:29:04,541] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-22 18:29:04,541] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-22 18:29:04,541] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-22 18:29:04,542] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-22 18:29:04,542] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-22 18:29:04,542] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-22 18:29:04,542] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-22 18:29:04,542] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-22 18:29:04,542] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-22 18:29:04,542] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
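These optimizer lines reflect an fp16 ZeRO stage 1 setup (the stage2.py file name is just where DeepSpeed implements both stages 1 and 2). A minimal sketch of the matching zero_optimization section, with the stage, bucket sizes and offload flag taken from the log and the surrounding keys assumed:

# Sketch of the DeepSpeed config these lines correspond to; values from the
# log, everything else is an assumption.
ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 1,
        "reduce_bucket_size": 500_000_000,
        "allgather_bucket_size": 500_000_000,
        "cpu_offload": False,
    },
}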
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils...
-Time to load utils op: 1.0921554565429688 seconds
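"ninja: no work to do." means the utils op was already compiled in the extensions cache and only had to be reloaded. Two environment knobs are visible in these lines: MAX_JOBS caps ninja's build parallelism (the log itself says it is overridable), and TORCH_EXTENSIONS_DIR relocates the per-user cache root that every rank prints above. A small sketch with illustrative values:

import os

# Illustrative values only; set these before the first extension build.
os.environ["MAX_JOBS"] = "8"                              # cap ninja workers
os.environ["TORCH_EXTENSIONS_DIR"] = "/tmp/torch_extensions"  # move the cache root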
-Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 0-3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 4-123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 124-127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0026960372924804688 seconds
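Every "Rank: N partition count ... sizes[...]" line above follows one fixed shape, so the ZeRO layout can be recovered mechanically from a saved log. A minimal parsing sketch in plain Python (the regex and helper names are mine, not DeepSpeed's):

import re
from collections import defaultdict

# Matches log lines of the form:
#   Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
PARTITION_LINE = re.compile(
    r"Rank: (?P<rank>\d+) partition count \[(?P<count>[^\]]+)\] and sizes\[(?P<sizes>.+)\]"
)

def group_ranks_by_partition(log_text: str) -> dict:
    """Bucket ranks that report identical optimizer partition sizes."""
    groups = defaultdict(list)
    for match in PARTITION_LINE.finditer(log_text):
        groups[match.group("sizes")].append(int(match.group("rank")))
    return dict(groups)

sample = "Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]"
print(group_ranks_by_partition(sample))
# {'(807360000, False), (179800, False)': [26]}

The two tuples per rank are the two optimizer parameter groups (the small ~180K-element group is presumably the no-weight-decay biases/LayerNorm group), and "partition count [1, 1]" means each group sits in a single ZeRO partition, consistent with a data-parallel degree of 1.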
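The "extensions root" / "skipping build step" / "Time to load utils op" chatter, of which one representative sequence is kept above, repeats once per process: every rank JIT-loads DeepSpeed's utils op through PyTorch's C++ extension loader, hits a warm build cache, and loads in about a millisecond. Roughly, the mechanism looks like this sketch (the source list is a placeholder, not DeepSpeed's actual file set):

import os
import time
from torch.utils.cpp_extension import load

# PyTorch caches JIT-built extensions under TORCH_EXTENSIONS_DIR; when the
# sources are unchanged it reports "No modifications detected ... skipping
# build step" and the load is nearly instantaneous.
os.environ.setdefault("TORCH_EXTENSIONS_DIR",
                      os.path.expanduser("~/.cache/torch_extensions"))

start = time.time()
utils = load(name="utils", sources=["csrc/utils.cpp"], verbose=True)  # placeholder source
print(f"Time to load utils op: {time.time() - start} seconds")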
-[2021-10-22 18:29:07,491] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-22 18:29:07,492] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-22 18:29:07,492] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.22 GB, percent = 21.5%
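In these see_memory_usage lines, MA/Max_MA are the current and peak GPU memory actually allocated to tensors, while CA/Max_CA are the (larger) amounts reserved by PyTorch's caching allocator. A minimal sketch of what such a helper reports, assuming torch and psutil (my reconstruction, not DeepSpeed's exact code):

import psutil
import torch

GB = 1 << 30

def see_memory_usage(message: str) -> None:
    """Print allocator statistics in the same shape as the log lines above."""
    print(message)
    print(f"MA {torch.cuda.memory_allocated() / GB:.2f} GB "
          f"Max_MA {torch.cuda.max_memory_allocated() / GB:.2f} GB "
          f"CA {torch.cuda.memory_reserved() / GB:.2f} GB "
          f"Max_CA {torch.cuda.max_memory_reserved() / GB:.2f} GB")
    vm = psutil.virtual_memory()
    print(f"CPU Virtual Memory: used = {vm.used / GB:.2f} GB, percent = {vm.percent}%")

see_memory_usage("Before initializing optimizer states")

The jump that follows (MA 5.47 GB here to 12.76 GB after initialization) is the fp32 optimizer state, master weights plus Adam's two moment buffers, being materialized for the rank's local parameters.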
-[2021-10-22 18:29:07,538] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-22 18:29:07,539] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-22 18:29:07,539] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.22 GB, percent = 21.5%
-[2021-10-22 18:29:07,539] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-22 18:29:07,568] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-22 18:29:07,568] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-22 18:29:07,568] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.22 GB, percent = 21.5%
-[2021-10-22 18:29:07,569] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-22 18:29:07,569] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-22 18:29:07,569] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-22 18:29:07,569] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-22 18:29:07,569] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] activation_checkpointing_config {
-  "partition_activations": false,
-  "contiguous_memory_optimization": false,
-  "cpu_checkpointing": false,
-  "number_checkpoints": null,
-  "synchronize_checkpoint_boundary": false,
-  "profile": false
-}
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] flops_profiler_config ........ {
-  "enabled": false,
-  "profile_step": 1,
-  "module_depth": -1,
-  "top_modules": 1,
-  "detailed": true,
-  "output_file": null
-}
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_config .................. {
-  "stage": 1,
-  "contiguous_gradients": true,
-  "reduce_scatter": true,
-  "reduce_bucket_size": 5.000000e+08,
-  "allgather_partitions": true,
-  "allgather_bucket_size": 5.000000e+08,
-  "overlap_comm": false,
-  "load_from_fp32_weights": true,
-  "elastic_checkpoint": true,
-  "offload_param": null,
-  "offload_optimizer": null,
-  "sub_group_size": 1.000000e+09,
-  "prefetch_bucket_size": 5.000000e+07,
-  "param_persistence_threshold": 1.000000e+05,
-  "max_live_parameters": 1.000000e+09,
-  "max_reuse_distance": 1.000000e+09,
-  "gather_fp16_weights_on_model_save": false,
-  "ignore_unused_parameters": true,
-  "round_robin_gradients": false,
-  "legacy_stage1": false
-}
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-22 18:29:07,571] [INFO] [config.py:946:print] json = {
-  "train_micro_batch_size_per_gpu": 1,
-  "train_batch_size": 2.048000e+03,
-  "gradient_clipping": 1.0,
-  "zero_optimization": {
-    "stage": 1
-  },
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 12
-  },
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": 64,
-    "max_difficulty": 2.048000e+03,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": 3.600000e+04,
-      "difficulty_step": 8
-    }
-  },
-  "steps_per_print": 2.000000e+03,
-  "wall_clock_breakdown": false
-}
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
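The "json = {...}" block above is the client-side config handed to DeepSpeed; every other line in the engine dump is a default. As a hedged sketch of how such a config is consumed (the stand-in model and file name are mine; the real run wires this up inside Megatron-DeepSpeed's training script):

import torch
import deepspeed

# ds_config.json: the "json = {...}" block printed above, saved verbatim.
model = torch.nn.Linear(8, 8)  # stand-in for the real pipelined GPT model
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",
)

The batch bookkeeping is visible in the dump: train_batch_size (2048) = train_micro_batch_size_per_gpu (1) x gradient_accumulation_steps (2048) x data-parallel size (world_size = 1 here, consistent with all 128 GPUs being consumed by the 32 pipeline stages x 4-way tensor parallelism).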
-Time to load utils op: 0.0008153915405273438 seconds
-[2021-10-22 18:29:07,572] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] pipeline layout per rank (every rank reports TOTAL_PARAMS=104731203200 (104731.203M), UNIQUE_PARAMS=104048195200 (104048.195M)):
-  RANK=0-3      STAGE=0   LAYERS=5 [0, 5)    STAGE_PARAMS=978291800 (978.292M)
-  RANK=5        STAGE=1   LAYERS=2 [5, 7)    STAGE_PARAMS=807539800 (807.540M)
-  RANK=8,10,11  STAGE=2   LAYERS=2 [7, 9)    STAGE_PARAMS=807539800 (807.540M)
-  RANK=16-19    STAGE=4   LAYERS=2 [11, 13)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=31       STAGE=7   LAYERS=2 [17, 19)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=32-35    STAGE=8   LAYERS=2 [19, 21)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=40       STAGE=10  LAYERS=2 [23, 25)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=48-51    STAGE=12  LAYERS=2 [27, 29)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=57       STAGE=14  LAYERS=2 [31, 33)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=64-67    STAGE=16  LAYERS=2 [35, 37)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=70       STAGE=17  LAYERS=2 [37, 39)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=72-75    STAGE=18  LAYERS=2 [39, 41)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=76       STAGE=19  LAYERS=2 [41, 43)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=80-83    STAGE=20  LAYERS=2 [43, 45)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=96-99    STAGE=24  LAYERS=2 [51, 53)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=104-107  STAGE=26  LAYERS=2 [55, 57)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=113-115  STAGE=28  LAYERS=2 [59, 61)  STAGE_PARAMS=807539800 (807.540M)
-  RANK=120,123  STAGE=30  LAYERS=2 [63, 65)  STAGE_PARAMS=807539800 (807.540M)
-[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19
LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 
[37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) 
STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 
11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) 
STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) - > using checkpoint value 6e-05 for learning rate - > using checkpoint value 6e-06 for minimum learning rate - > using checkpoint value 216320 for warmup iterations - > using checkpoint value 600000000 for total number of iterations - > using checkpoint value cosine for decay style -successfully loaded 1 ZeRO state_dicts for rank 86 -successfully loaded 1 ZeRO state_dicts for rank 85 -successfully loaded 1 ZeRO state_dicts for rank 42 -successfully loaded 1 ZeRO state_dicts for rank 84 -successfully loaded 1 ZeRO state_dicts for rank 91 -successfully loaded 1 ZeRO state_dicts for rank 89 -successfully loaded 1 ZeRO state_dicts for rank 88 -successfully loaded 1 ZeRO state_dicts for rank 113 -successfully loaded 1 ZeRO state_dicts for rank 93 -successfully loaded 1 ZeRO state_dicts for rank 96 -successfully loaded 1 ZeRO state_dicts for rank 41 -successfully loaded 1 ZeRO state_dicts for rank 90 -successfully loaded 1 ZeRO state_dicts for rank 34 -successfully loaded 1 ZeRO state_dicts for rank 95 -successfully loaded 1 ZeRO state_dicts for rank 101 -successfully loaded 1 ZeRO state_dicts for rank 115 -successfully loaded 1 ZeRO state_dicts for rank 99 -successfully loaded 1 ZeRO state_dicts for rank 100 -successfully loaded 1 ZeRO state_dicts for rank 10 -successfully loaded 1 ZeRO state_dicts for rank 112 -successfully loaded 1 ZeRO state_dicts for rank 87 -successfully loaded 1 ZeRO state_dicts for rank 40 -successfully loaded 1 ZeRO state_dicts for rank 32 -successfully loaded 1 ZeRO state_dicts for rank 20 -successfully loaded 
-successfully loaded 1 ZeRO state_dicts for rank 0
-loading 1 zero partition checkpoints for rank 0
- checkpoint version 3.0
- successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 34
-time (ms) | load-checkpoint: 13304.77
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
- warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 125.2213504
-estimated model parameters: 125.22432
-estimated model parameters: 103.3650944
-estimated model parameters without embeddings: 103.3650944
-estimated model parameters without embeddings: 103.368064
-estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage 
hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold 
several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies 
of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the 
first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage 
hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and 
last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated 
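For context, the "estimated model parameters" figure is consistent with ordinary dense-transformer arithmetic. The sketch below is a back-of-the-envelope estimator, not the actual helper in megatron/utils.py, and the hyperparameters (64 layers, hidden size 11600) are assumed from the tr8b-104B configuration rather than read from this log:

```python
# Illustrative GPT parameter estimate; n_layers and hidden are assumptions.
def estimate_params_billions(n_layers=64, hidden=11600):
    # Per layer: 4h^2 (attention) + 8h^2 (MLP) weight matrices,
    # plus roughly 13h for the biases and the two layernorms.
    per_layer = 12 * hidden**2 + 13 * hidden
    return (n_layers * per_layer + 2 * hidden) / 1e9  # 2h: final layernorm

print(estimate_params_billions())  # -> ~103.35, vs. 103.3650944 in the log
```

The small residual (~0.01%) depends on which minor tensors (e.g. position embeddings) the in-tree estimator includes; the 125.2B "with embeddings" totals printed further down are the ones the UserWarning flags as unreliable for PP > 1.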
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-22 18:29:21
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 3000320
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.125407 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.349 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.248 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.080 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-22 18:29:27
-done with setup ...
-training ...
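The train/validation/test boundaries above can be reproduced from a 949/50/1 split ratio (the ratio is inferred here from the boundaries themselves; it is not printed in this part of the log). A minimal sketch of the boundary arithmetic, mirroring how Megatron-style splits round cumulative ratios and then shift everything so the last boundary lands exactly on the document count:

```python
import math

def split_bounds(ratios, size):
    # Cumulative ratio * size, rounded per split ...
    fracs = [r / sum(ratios) for r in ratios]
    bounds = [0]
    for f in fracs:
        bounds.append(bounds[-1] + int(round(f * size)))
    # ... then shift so rounding error cannot leave documents unassigned.
    diff = bounds[-1] - size
    return [bounds[0]] + [b - diff for b in bounds[1:]]

print(split_bounds([949, 50, 1], 304_230_423))
# -> [0, 288714672, 303926193, 304230423], matching the boundaries above

# The 5 train epochs are also consistent: one epoch holds
# ~657686117 / 5 samples, and 4 epochs would fall short of the
# 600M-sample target while 5 cover it.
print(math.ceil(600_000_000 / (657_686_117 / 5)))  # -> 5
```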
-time (ms) | model-and-optimizer-setup: 19311.12 | train/valid/test-data-iterators-setup: 5548.88
-Number of parameters: 103.3650944 billion
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-[each rank prints its own count; duplicates are elided and only the distinct values are kept -- per the UserWarning above, the with-embeddings figures are not reliable at PP > 1]
-[before the start of training step] datetime: 2021-10-22 18:29:27
-[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-22 18:29:27,695] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
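The per-rank lines that follow report PyTorch's CUDA allocator counters. A minimal sketch of such a reporter, using only the standard torch.cuda counters (an illustration in the spirit of Megatron's memory report, not its exact code):

```python
import torch

def report_memory(tag: str) -> None:
    # Convert the four standard CUDA allocator counters to MB and print
    # them in the same "key: value | ..." shape as the log lines below.
    mb = 1 << 20
    print(f"{tag} memory (MB)"
          f" | allocated: {torch.cuda.memory_allocated() / mb}"
          f" | max allocated: {torch.cuda.max_memory_allocated() / mb}"
          f" | reserved: {torch.cuda.memory_reserved() / mb}"
          f" | max reserved: {torch.cuda.max_memory_reserved() / mb}")

# e.g. report_memory("[Rank 0] (after 35 iterations)")
```

"reserved" is the caching allocator's pool, which is why it sits above "allocated". Note also that ranks 0-2 and 124-126 report ~13.1-13.2 GB allocated versus ~10.8 GB on the middle ranks, consistent with the first and last pipeline stages carrying the embedding copies mentioned in the warning earlier.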
-[Rank 0] (after 35 iterations) memory (MB) | allocated: 13201.28759765625 | max allocated: 20664.83642578125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 2] (after 35 iterations) memory (MB) | allocated: 13203.47900390625 | max allocated: 20667.02783203125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 4] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 114] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0
-[Rank 124] (after 35 iterations) memory (MB) | allocated: 13082.482421875 | max allocated: 20546.08837890625 | reserved: 24406.0 | max reserved: 24406.0
-[... analogous lines for all 128 ranks elided: ranks 0-3 report ~13202 MB allocated with 24442 MB reserved, ranks 124-127 ~13082 MB allocated with 24406 MB reserved, and all other ranks 10787.1 MB allocated with 16994-20086 MB reserved ...]
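The four numbers in each line above are PyTorch's CUDA-allocator counters. A self-contained sketch of producing such a line; the rank and iteration arguments are placeholders, not the exact Megatron-DeepSpeed helper:

    # Sketch: read the CUDA-allocator counters reported per rank above.
    # The MB conversion and message format mirror the log lines.
    import torch

    def report_memory(rank: int, iteration: int) -> str:
        mb = 1024 * 1024
        return (
            f"[Rank {rank}] (after {iteration} iterations) memory (MB) | "
            f"allocated: {torch.cuda.memory_allocated() / mb} | "
            f"max allocated: {torch.cuda.max_memory_allocated() / mb} | "
            f"reserved: {torch.cuda.memory_reserved() / mb} | "
            f"max reserved: {torch.cuda.max_memory_reserved() / mb}"
        )

    if torch.cuda.is_available():
        print(report_memory(rank=0, iteration=35))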
- iteration 35/ 292968 | consumed samples: 71680 | consumed tokens: 4587520 | elapsed time per iteration (ms): 170231.7 | learning rate: 1.988E-05 | global batch size: 2048 | lm loss: 1.020244E+01 | loss scale: 4096.0 | grad norm: 232297.002 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 36/ 292968 | consumed samples: 73728 | consumed tokens: 4718592 | elapsed time per iteration (ms): 95192.8 | learning rate: 2.045E-05 | global batch size: 2048 | lm loss: 1.179706E+01 | loss scale: 4096.0 | grad norm: 394431.999 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 37/ 292968 | consumed samples: 75776 | consumed tokens: 4849664 | elapsed time per iteration (ms): 94263.9 | learning rate: 2.102E-05 | global batch size: 2048 | lm loss: 1.159876E+01 | loss scale: 4096.0 | grad norm: 309552.600 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 38/ 292968 | consumed samples: 77824 | consumed tokens: 4980736 | elapsed time per iteration (ms): 94613.8 | learning rate: 2.159E-05 | global batch size: 2048 | lm loss: 1.126956E+01 | loss scale: 4096.0 | grad norm: 326011.438 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 39/ 292968 | consumed samples: 79872 | consumed tokens: 5111808 | elapsed time per iteration (ms): 95822.0 | learning rate: 2.215E-05 | global batch size: 2048 | lm loss: 1.047825E+01 | loss scale: 4096.0 | grad norm: 181115.439 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 40/ 292968 | consumed samples: 81920 | consumed tokens: 5242880 | elapsed time per iteration (ms): 96049.2 | learning rate: 2.272E-05 | global batch size: 2048 | lm loss: 1.009597E+01 | loss scale: 4096.0 | grad norm: 105708.713 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 41/ 292968 | consumed samples: 83968 | consumed tokens: 5373952 | elapsed time per iteration (ms): 96857.1 | learning rate: 2.329E-05 | global batch size: 2048 | lm loss: 9.645950E+00 | loss scale: 4096.0 | grad norm: 54189.229 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 42/ 292968 | consumed samples: 86016 | consumed tokens: 5505024 | elapsed time per iteration (ms): 96536.5 | learning rate: 2.386E-05 | global batch size: 2048 | lm loss: 9.366836E+00 | loss scale: 4096.0 | grad norm: 36765.384 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 43/ 292968 | consumed samples: 88064 | consumed tokens: 5636096 | elapsed time per iteration (ms): 97014.4 | learning rate: 2.443E-05 | global batch size: 2048 | lm loss: 9.295312E+00 | loss scale: 4096.0 | grad norm: 101399.317 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 44/ 292968 | consumed samples: 90112 | consumed tokens: 5767168 | elapsed time per iteration (ms): 104666.0 | learning rate: 2.499E-05 | global batch size: 2048 | lm loss: 9.078954E+00 | loss scale: 4096.0 | grad norm: 45212.899 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 45/ 292968 | consumed samples: 92160 | consumed tokens: 5898240 | elapsed time per iteration (ms): 96895.5 | learning rate: 2.556E-05 | global batch size: 2048 | lm loss: 9.004776E+00 | loss scale: 4096.0 | grad norm: 64467.756 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 46/ 292968 | consumed samples: 94208 | consumed tokens: 6029312 | elapsed time per iteration (ms): 95869.1 | learning rate: 2.613E-05 | global batch size: 2048 | lm loss: 8.858628E+00 | loss scale: 4096.0 | grad norm: 34756.107 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 47/ 292968 | consumed samples: 96256 | consumed tokens: 6160384 | elapsed time per iteration (ms): 95837.6 | learning rate: 2.670E-05 | global batch size: 2048 | lm loss: 8.663449E+00 | loss scale: 4096.0 | grad norm: 48155.205 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 48/ 292968 | consumed samples: 98304 | consumed tokens: 6291456 | elapsed time per iteration (ms): 95739.1 | learning rate: 2.727E-05 | global batch size: 2048 | lm loss: 8.545946E+00 | loss scale: 4096.0 | grad norm: 47054.317 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 49/ 292968 | consumed samples: 100352 | consumed tokens: 6422528 | elapsed time per iteration (ms): 94691.8 | learning rate: 2.783E-05 | global batch size: 2048 | lm loss: 8.737078E+00 | loss scale: 4096.0 | grad norm: 147984.860 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 50/ 292968 | consumed samples: 102400 | consumed tokens: 6553600 | elapsed time per iteration (ms): 96272.3 | learning rate: 2.840E-05 | global batch size: 2048 | lm loss: 8.645372E+00 | loss scale: 4096.0 | grad norm: 100115.276 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 51/ 292968 | consumed samples: 104448 | consumed tokens: 6684672 | elapsed time per iteration (ms): 96225.8 | learning rate: 2.897E-05 | global batch size: 2048 | lm loss: 8.786609E+00 | loss scale: 4096.0 | grad norm: 138446.949 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 52/ 292968 | consumed samples: 106496 | consumed tokens: 6815744 | elapsed time per iteration (ms): 93767.5 | learning rate: 2.954E-05 | global batch size: 2048 | lm loss: 8.520951E+00 | loss scale: 4096.0 | grad norm: 72259.747 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 53/ 292968 | consumed samples: 108544 | consumed tokens: 6946816 | elapsed time per iteration (ms): 95896.3 | learning rate: 3.011E-05 | global batch size: 2048 | lm loss: 8.274112E+00 | loss scale: 4096.0 | grad norm: 30192.728 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 54/ 292968 | consumed samples: 110592 | consumed tokens: 7077888 | elapsed time per iteration (ms): 94348.2 | learning rate: 3.067E-05 | global batch size: 2048 | lm loss: 8.363799E+00 | loss scale: 4096.0 | grad norm: 70109.113 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration 55/ 292968 | consumed samples: 112640 | consumed tokens: 7208960 | elapsed time per iteration (ms): 96086.5 | learning rate: 3.124E-05 | global batch size: 2048 | lm loss: 8.283342E+00 | loss scale: 4096.0 | grad norm: 32869.639 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
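Since the per-iteration lines above carry the metrics worth tracking, here is a small self-contained sketch of pulling iteration, learning rate, and loss out of them with a regex; the field names and ordering are exactly those in the log, while the sample string is a trimmed example:

    # Sketch: extract (iteration, learning rate, lm loss) from
    # Megatron-style iteration lines like the ones above.
    import re

    PATTERN = re.compile(
        r"iteration\s+(\d+)/\s*\d+.*?"
        r"learning rate:\s+([\d.E+-]+).*?"
        r"lm loss:\s+([\d.E+-]+)"
    )

    def parse_line(line: str):
        m = PATTERN.search(line)
        if m is None:
            return None
        it, lr, loss = m.groups()
        return int(it), float(lr), float(loss)

    sample = ("iteration 55/ 292968 | consumed samples: 112640 | "
              "learning rate: 3.124E-05 | lm loss: 8.283342E+00 |")
    print(parse_line(sample))  # -> (55, 3.124e-05, 8.283342)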
-Killing subprocess 4027110
-slurmstepd: error: *** STEP 1655850.0 ON r6i4n5 CANCELLED AT 2021-10-22T19:05:02 ***
-[... one "Killing subprocess <pid>" line per worker and one "Main process received SIGTERM, exiting" line per node elided ...]
-srun: Job step aborted: Waiting up to 62 seconds for job step to finish.
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-[... the same OMP_NUM_THREADS banner, printed once per launcher process on the restarted job, elided ...]
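The banner above is the PyTorch distributed launcher defaulting OMP_NUM_THREADS to 1 for every worker it spawns. A minimal sketch of doing that tuning explicitly in your own entry point; the value 4 is an arbitrary illustration, not a recommendation from this log:

    # Sketch: pin the OpenMP thread count per process before torch (whose
    # CPU ops read this variable at import time) is loaded. "4" is an
    # illustrative value only.
    import os

    os.environ.setdefault("OMP_NUM_THREADS", "4")

    import torch  # imported after the env var is set, on purpose
    print("intra-op threads:", torch.get_num_threads())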
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at runtime if needed. Op compatibility means that your system meet the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
--------------------------------------------------
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
-[... the same op report, printed concurrently by every process and character-interleaved in the raw log, reconstructed to a single clean copy above ...]
Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja - ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -ninjafused_lamb ............................... [NO][OKAY] - .......-------------------------------------------------- -[OKAY] -op name ................ installed .. compatible --------------------------------------------------- -sparse_attn ............ [NO] cpu_adam....... ...............[OKAY] -[NO] .......transformer [OKAY]............ - [NO] ....... [OKAY] -stochastic_transformer fused_adam .ninja............. [NO] [NO] ................................ [OKAY][OKAY][OKAY] - - --------------------------------------------------- -fused_lamb op name............. ................[NO] installed....... ..[OKAY] -compatible --------------------------------------------------- -cpu_adam sparse_attn............... ............[NO] [NO]....... .......[OKAY] -[OKAY] -ninjatransformer .............................. [NO]fused_adam[OKAY] -.................... --------------------------------------------------[OKAY][NO] - - .......op name [OKAY]stochastic_transformer................ - installed .fused_lamb.. [NO].............compatible -.......[NO]-------------------------------------------------- .......[OKAY] - -[OKAY] -cpu_adam ............... [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -fused_adamtransformer ......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lambstochastic_transformer ............. .[NO] [NO]....... ....... [OKAY][OKAY] - -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatibleninja --------------------------------------------------- -.................. [OKAY] --------------------------------------------------- -op name cpu_adam................ ............... installed[NO] ......... [OKAY]compatible -ninja --------------------------------------------------- -ninja.................. ..................[OKAY] -fused_adam[OKAY] -.............--------------------------------------------------cpu_adam --------------------------------------------------[NO] - - op nameop name...................... ................................[OKAY] installed[NO] -installed .. ....... fused_lamb.. 
compatible -.............[OKAY]compatible-------------------------------------------------- -[NO] - - --------------------------------------------------....... -[OKAY] -cpu_adamfused_adam ............... cpu_adam.............[NO] ...............[NO]....... sparse_attn[NO][OKAY]....... - ............ ....... [OKAY][NO] - [OKAY]....... - fused_lamb[OKAY]fused_adam - ..........................transformer [NO][NO]............ fused_adam..............[NO] [OKAY]....................[OKAY] - - [NO][OKAY] -.......fused_lamb .............stochastic_transformer [OKAY][NO] ........sparse_attn [NO] - [OKAY]................... -fused_lamb [OKAY][NO] -............. .......[NO] [OKAY]....... - [OKAY]transformer - sparse_attn............ ............[NO] [NO] .............. [OKAY][OKAY]sparse_attn - -............ transformer[NO] stochastic_transformer............ .......[NO] .[OKAY]....... -[NO][OKAY] transformer -....... ............[OKAY] -[NO]stochastic_transformer ....... .[OKAY] -[NO] ....... [OKAY]stochastic_transformer - . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -ninja .................. [OKAY] -fused_adam-------------------------------------------------- ninja -............. ninja ..................op name [NO] ..................[OKAY] ....................... -[OKAY] --------------------------------------------------[OKAY] - -installed --------------------------------------------------- op name - fused_lamb................ ..op name ............. installedcompatible ..................[NO] - compatible --------------------------------------------------.......installed - - --------------------------------------------------[OKAY] -.. 
- compatiblecpu_adam - ...............-------------------------------------------------- -cpu_adam[NO] ...................... sparse_attn[NO]cpu_adam ....... ............[OKAY] ...............[OKAY] -[NO] - .......[NO] [OKAY]....... - [OKAY] -transformerfused_adamfused_adam ...................................... [NO][NO] [NO]fused_adam.............. [OKAY]....................[OKAY] - [NO] -[OKAY] stochastic_transformer -....... fused_lamb [OKAY].fused_lamb............. - [NO].............[NO] fused_lamb ....... .......[NO] ............. [OKAY] [OKAY] ....... - -[NO] [OKAY]....... - [OKAY] -sparse_attnsparse_attn sparse_attn............ ........................[NO] [NO][NO] ....... ..............[OKAY] - [OKAY][OKAY] - -transformer transformer............transformer ............[NO]............ .......[NO][NO] [OKAY].............. - [OKAY][OKAY] - -stochastic_transformer stochastic_transformerstochastic_transformer. [NO]. . ....... [NO] [NO] [OKAY] .............. - [OKAY][OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -ninja .................. [OKAY]cpu_adam - ............... --------------------------------------------------[NO] - .......op name [OKAY] -................ installed .. compatible --------------------------------------------------- -fused_adam ............. [NO] .......cpu_adam [OKAY]............... - [NO] ....... fused_lamb[OKAY] -............. [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] fused_lamb....... .............[OKAY] -[NO] .......transformer [OKAY]............ - [NO] ....... [OKAY] -stochastic_transformer . sparse_attn[NO] ................... [NO][OKAY] -....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [NO][NO] .............. [OKAY][OKAY] - -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lamb ............. fused_lamb[NO] .................... [NO][OKAY] -....... [OKAY] -sparse_attn ............sparse_attn [NO]............ .......[NO] [OKAY]....... - [OKAY] -transformer ............transformer [NO]............ .......[NO] [OKAY]....... - [OKAY] -stochastic_transformer .stochastic_transformer [NO] ........ [NO][OKAY] -....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -ninja fused_adam.................. ............. [OKAY][NO] - .......-------------------------------------------------- [OKAY] - -op nameninja fused_lamb ............................................... [NO]ninja[OKAY]installed - ....... --------------------------------------------------.................. .. -[OKAY] -op name[OKAY]compatible -................ - ----------------------------------------------------------------------------------------------------installed - -op name.. compatible................ -sparse_attn -------------------------------------------------- installed -cpu_adam............ [NO]................. .......compatiblecpu_adam [NO] - [OKAY]--------------------------------------------------............... -....... 
-transformer [NO][OKAY] ............ - .......[NO] cpu_adam [OKAY]....... - ...............[OKAY] - fused_adam[NO] .................... stochastic_transformer[OKAY][NO] -fused_adam . ....................[NO] [NO]....... [OKAY] fused_adam.......[OKAY] - -[OKAY] -............. fused_lamb[NO] .............fused_lamb....... .............[NO][OKAY] -[NO]....... .......fused_lamb[OKAY] -[OKAY]............. - [NO] ....... [OKAY] -sparse_attn ............sparse_attn [NO]............ sparse_attn[NO]....... ............ ....... [OKAY] -[NO][OKAY] - .......transformer [OKAY]transformer............ - ............[NO]transformer [NO]................... .......[OKAY][NO] -[OKAY] -.......stochastic_transformerstochastic_transformer [OKAY] -.. [NO][NO]stochastic_transformer ....... ....... [OKAY][OKAY]. - - [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja -DeepSpeed C++/CUDA extension op report - -JIT compiled ops requires ninja --------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -ninja .................. fused_adam[OKAY] ............. - [NO]-------------------------------------------------- -....... [OKAY]op name - ................ installed fused_lamb.. ............. compatible[NO] -....... --------------------------------------------------[OKAY] - -cpu_adam ............... ninja[NO] sparse_attn.........................ninja ............ [OKAY] [OKAY].................. -[NO] -[OKAY]....... - --------------------------------------------------[OKAY] --------------------------------------------------- - -op name fused_adam................op name transformer ............. ................installed ............ [NO].. installed [NO] ....... .. .......compatible -[OKAY]compatible[OKAY] - --------------------------------------------------- - ---------------------------------------------------fused_lamb - ............. 
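The per-op report above is printed by DeepSpeed's extension-op machinery at import time, once per rank; the same probe can be run by hand. A minimal sketch, assuming the deepspeed.ops.op_builder API of the 0.5.x line in use here:

    # Sketch: probe one op the way the report above does (assumes the
    # deepspeed.ops.op_builder API of DeepSpeed 0.5.x).
    from deepspeed.ops.op_builder import CPUAdamBuilder

    builder = CPUAdamBuilder()
    # is_compatible() backs the right-hand [OKAY] column; "installed [NO]"
    # means the op ships uncompiled and ninja JIT-builds it on first load().
    print(builder.name, builder.is_compatible())
    cpu_adam_ops = builder.load()  # triggers the ninja JIT build if not cached

The full table can also be regenerated with the ds_report utility that ships with DeepSpeed.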
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-devel package with yum
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
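The async_io probe fails only because the libaio development files are missing; per the warnings above, installing libaio-devel (yum) or pointing CFLAGS/LDFLAGS at a from-source build clears it. A sketch of re-running the probe after exporting those variables (paths are illustrative, and the AsyncIOBuilder name is assumed from DeepSpeed 0.5.x):

    # Sketch: point the JIT toolchain at a from-source libaio, then re-run
    # the probe behind the warnings above (paths below are illustrative).
    import os
    os.environ["CFLAGS"] = "-I/usr/local/include"  # assumed dir containing libaio.h
    os.environ["LDFLAGS"] = "-L/usr/local/lib"     # assumed dir containing libaio.so

    from deepspeed.ops.op_builder import AsyncIOBuilder  # name per DeepSpeed 0.5.x
    print(AsyncIOBuilder().is_compatible())  # False here, hence "async_io ... [NO] ....... [NO]"

async_io is optional for this run; the remaining ops still report [OKAY].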
DeepSpeed general environment info:
torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
torch version .................... 1.8.1
torch cuda version ............... 11.1
nvcc version ..................... 11.2
deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
deepspeed info ................... 0.5.5+29bee73, 29bee73, master
deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
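The environment block records the toolchain each rank actually loaded; it can be cross-checked from Python using only stock torch/deepspeed attributes:

    # Sketch: confirm the versions the block above was generated against.
    import torch
    import deepspeed

    print(torch.__file__)         # .../site-packages/torch/__init__.py, cf. torch install path
    print(torch.__version__)      # 1.8.1
    print(torch.version.cuda)     # 11.1 (CUDA the torch wheel was built with)
    print(deepspeed.__version__)  # 0.5.5+29bee73

The torch-CUDA 11.1 vs. nvcc 11.2 split is a minor-version mismatch within the same major release, which the JIT builds here evidently tolerate, as the op report's [OKAY] columns show.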
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
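Both the op report and the environment summary printed here can be regenerated on any node with DeepSpeed's bundled ds_report utility, which is useful for spotting ranks whose environments have drifted; and since every op above is [NO] installed but [OKAY] compatible, i.e. compiled just-in-time on first use, ninja must be importable in the same environment:

    # Reproduce the op compatibility and environment report on one node:
    ds_report

    # The ops in the report are JIT-compiled, which requires ninja:
    pip install ninja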
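The next line records the Megatron git stamp for this run (git_hash=bdc6ad6, git_branch=main). A quick cross-check of the working tree, assuming a hypothetical checkout location next to the DeepSpeed path logged above:

    # Hypothetical path; substitute the actual Megatron-DeepSpeed checkout:
    cd /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed
    git rev-parse --short HEAD          # expect: bdc6ad6
    git rev-parse --abbrev-ref HEAD     # expect: main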
[OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 
11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] async_io....... [NO]............... - [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. utils[NO] ......................... [NO][OKAY] -....... [OKAY] -quantizerutils ................................ [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path ............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version torch version.................... ....................1.8.1 -1.8.1 -torch cuda version ...............torch cuda version 11.1............... 
- nvcc version11.1 -.....................nvcc version 11.2..................... - deepspeed install path11.2 -...........deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed info -...................deepspeed info 0.5.5+29bee73, 29bee73, master................... - deepspeed wheel compiled w.0.5.5+29bee73, 29bee73, master -......deepspeed wheel compiled w. torch 1.8, cuda 11.1...... - torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io: please install the libaio-devel package with yumtransformer_inference .. -[NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch version torch version.................... ....................1.8.1 -1.8.1 -torch cuda version torch cuda version............... ...............11.1 -11.1nvcc version - nvcc version..................... .....................11.2 -11.2deepspeed install path - deepspeed install path........... ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed info - deepspeed info................... ...................0.5.5+29bee73, 29bee73, master -0.5.5+29bee73, 29bee73, masterdeepspeed wheel compiled w. - deepspeed wheel compiled w....... ......torch 1.8, cuda 11.1 -torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -DeepSpeed general environment info: -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version ....................DeepSpeed general environment info: 1.8.1 - -torch cuda version ............... 11.1torch install path -nvcc version .................................... 11.2 -deepspeed install path ........... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -torch versiondeepspeed info ....................................... 1.8.10.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w. torch cuda version...... ...............torch 1.8, cuda 11.1 -11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inferenceasync_io .. ...............[NO] [NO]....... .......[OKAY] -[NO] -utils .................. [NO] ....... [OKAY] -transformer_inference .. quantizer[NO] ..................... [NO][OKAY] -....... [OKAY] -utils-------------------------------------------------- -.................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. 
compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ******** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ******** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report --------------------------------------------------- - --------------------------------------------------- - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaJIT compiled ops requires ninjaJIT compiled ops requires ninja - - - --------------------------------------------------- -JIT compiled ops requires ninja ------------------------------------------------------------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report --------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -JIT compiled ops requires ninja-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -JIT compiled ops requires ninja--------------------------------------------------DeepSpeed C++/CUDA extension op report - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops require ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
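Every rank prints the same op report: no ops are pre-installed ([NO]), but all are compatible ([OKAY]), so DeepSpeed will JIT-compile them on first use, which is why ninja must be present. A minimal sketch of the same prerequisite checks, assuming only a standard torch/DeepSpeed install (the ds_report CLI, shipped with the deepspeed package, prints the full table):

# Sketch: reproduce the prerequisite checks behind the report above.
# Assumes torch and deepspeed import cleanly; `ds_report` (installed
# with the deepspeed pip package) prints the full compatibility table.
import torch
import torch.utils.cpp_extension as cpp_ext
import deepspeed

# JIT-compiling the not-yet-installed ops requires ninja; torch ships
# the same availability check it uses before building extensions.
print("ninja available :", cpp_ext.is_ninja_available())
print("torch / cuda    :", torch.__version__, "/", torch.version.cuda)
print("deepspeed       :", deepspeed.__version__)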
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
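The async_io op is the one exception: it needs the libaio development files (shared object and headers), and every rank reports them missing, so async_io stays [NO]/[NO] while the other inference ops remain JIT-compatible. A hedged sketch of a quick local check; note that ctypes can only locate the shared object, not the headers the JIT build also needs:

# Sketch: check whether the libaio runtime library that async_io wants
# is discoverable at all. Finding it does NOT prove the dev headers
# exist, which is what the JIT build actually requires.
import ctypes.util

libaio = ctypes.util.find_library("aio")
if libaio:
    print(f"found {libaio}; if the build still fails, install the headers")
else:
    print("libaio not found; install libaio-devel (yum) or libaio-dev (apt)")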
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ...............
11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32 -using torch.float16 for parameters ... ------------------------- arguments ------------------------ - accumulate_allreduce_grads_in_fp32 .............. False - adam_beta1 ...................................... 0.9 - adam_beta2 ...................................... 0.95 - adam_eps ........................................ 1e-08 - adlr_autoresume ................................. False - adlr_autoresume_interval ........................ 1000 - apply_query_key_layer_scaling ................... True - apply_residual_connection_post_layernorm ........ False - attention_dropout ............................... 0.1 - attention_softmax_in_fp32 ....................... False - bert_binary_head ................................ True - bert_load ....................................... None - bf16 ............................................ False - bias_dropout_fusion ............................. True - bias_gelu_fusion ................................ True - biencoder_projection_dim ........................ 0 - biencoder_shared_query_context_model ............ False - block_data_path ................................. None - checkpoint_activations .......................... True - checkpoint_in_cpu ............................... False - checkpoint_num_layers ........................... 1 - clip_grad ....................................... 1.0 - codecarbon_dir .................................. None - consumed_train_samples .......................... 0 - consumed_train_tokens ........................... 0 - consumed_valid_samples .......................... 0 - contigious_checkpointing ........................ False - cpu_optimizer ................................... False - cpu_torch_adam .................................. False - curriculum_learning ............................. False - data_impl ....................................... mmap - data_parallel_size .............................. 1 - data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document'] - dataloader_type ................................. single - DDP_impl ........................................ local - decoder_seq_length .............................. None - deepscale ....................................... False - deepscale_config ................................ None - deepspeed ....................................... True - deepspeed_activation_checkpointing .............. True - deepspeed_config ................................ ./ds_config.1656313.json - deepspeed_mpi ................................... False - distribute_checkpointed_activations ............. False - distributed_backend ............................. nccl - embedding_path .................................. None - encoder_seq_length .............................. 2048 - eod_mask_loss ................................... False - eval_interval ................................... 1000 - eval_iters ...................................... 5 - evidence_data_path .............................. None - exit_duration_in_mins ........................... 
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.006
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 6e-05
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 216320
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-22 19:41:40,451] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.298 seconds
-> compiling and loading fused kernels ...
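Several numbers in the argument dump above are derived rather than set directly, and it is worth checking that they are mutually consistent. The sketch below is illustrative Python, not code from the repo, and assumes the usual Megatron conventions: the vocabulary is padded up to a multiple of make_vocab_size_divisible_by times tensor_model_parallel_size, kv_channels is hidden_size divided by num_attention_heads, and the number of micro-batches is global_batch_size divided by (micro_batch_size times data_parallel_size).

# Illustrative sketch (not Megatron-DeepSpeed code) re-deriving logged values
# from the raw arguments above.

def padded_vocab_size(orig_size: int, divisible_by: int, tp_size: int) -> int:
    # Assumed padding rule: round up so every tensor-parallel shard is equal.
    multiple = divisible_by * tp_size
    return ((orig_size + multiple - 1) // multiple) * multiple

padded = padded_vocab_size(50257, 128, 4)
assert padded == 50688 and padded - 50257 == 431  # "padded vocab ... 431 dummy tokens (new size: 50688)"
assert 11600 // 80 == 145                         # kv_channels = hidden_size / num_attention_heads
assert 2048 // (1 * 1) == 2048                    # "setting number of micro-batches to constant 2048"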
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
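The UserWarning above comes from torch.utils.cpp_extension, which each rank hits while JIT-loading the fused kernels: CC resolves to c++ rather than g++, which PyTorch flags even though on most Linux systems c++ is simply a symlink to the same GNU compiler, so the build is expected to succeed. "ninja: no work to do" means the extensions were already compiled and are being reloaded from cache. The loading step looks roughly like the hedged sketch below; torch.utils.cpp_extension.load is the real API, but the source file names and flags are illustrative assumptions, not the exact ones in megatron/fused_kernels.

# Hedged sketch of the JIT build that produces the ninja output above.
import pathlib
from torch.utils import cpp_extension

srcdir = pathlib.Path("megatron/fused_kernels")  # assumed layout
scaled_masked_softmax_cuda = cpp_extension.load(
    name="scaled_masked_softmax_cuda",
    sources=[
        str(srcdir / "scaled_masked_softmax.cpp"),      # hypothetical file names
        str(srcdir / "scaled_masked_softmax_cuda.cu"),
    ],
    extra_cuda_cflags=["-O3"],  # illustrative flags
    verbose=True,  # prints the "Emitting ninja build file ..." lines seen above
)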
->>> done with compiling and loading fused kernels. Compilation time: 24.643 seconds
-time to initialize megatron (seconds): -18.561
-[after megatron is initialized] datetime: 2021-10-22 19:42:05
-building GPT model ...
-[2021-10-22 19:42:05,492] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-22 19:42:05,492] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-22 19:42:05,493] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.67 GB, percent = 20.7%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
-[2021-10-22 19:42:07,164] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2: <lambda>
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-    10: ParallelTransformerLayerPipe
-stage=4 layers=2
-    11: ParallelTransformerLayerPipe
-    12: ParallelTransformerLayerPipe
-stage=5 layers=2
-    13: ParallelTransformerLayerPipe
-    14: ParallelTransformerLayerPipe
-stage=6 layers=2
-    15: ParallelTransformerLayerPipe
-    16: ParallelTransformerLayerPipe
-stage=7 layers=2
-    17: ParallelTransformerLayerPipe
-    18: ParallelTransformerLayerPipe
-stage=8 layers=2
-    19: ParallelTransformerLayerPipe
-    20: ParallelTransformerLayerPipe
-stage=9 layers=2
-    21: ParallelTransformerLayerPipe
-    22: ParallelTransformerLayerPipe
-stage=10 layers=2
-    23: ParallelTransformerLayerPipe
-    24: ParallelTransformerLayerPipe
-stage=11 layers=2
-    25: ParallelTransformerLayerPipe
-    26: ParallelTransformerLayerPipe
-stage=12 layers=2
-    27: ParallelTransformerLayerPipe
-    28: ParallelTransformerLayerPipe
-stage=13 layers=2
-    29: ParallelTransformerLayerPipe
-    30: ParallelTransformerLayerPipe
-stage=14 layers=2
-    31: ParallelTransformerLayerPipe
-    32: ParallelTransformerLayerPipe
-stage=15 layers=2
-    33: ParallelTransformerLayerPipe
-    34: ParallelTransformerLayerPipe
-stage=16 layers=2
-    35: ParallelTransformerLayerPipe
-    36: ParallelTransformerLayerPipe
-stage=17 layers=2
-    37: ParallelTransformerLayerPipe
-    38: ParallelTransformerLayerPipe
-stage=18 layers=2
-    39: ParallelTransformerLayerPipe
-    40: ParallelTransformerLayerPipe
-stage=19 layers=2
-    41: ParallelTransformerLayerPipe
-    42: ParallelTransformerLayerPipe
-stage=20 layers=2
-    43: ParallelTransformerLayerPipe
-    44: ParallelTransformerLayerPipe
-stage=21 layers=2
-    45: ParallelTransformerLayerPipe
-    46: ParallelTransformerLayerPipe
-stage=22 layers=2
-    47: ParallelTransformerLayerPipe
-    48: ParallelTransformerLayerPipe
-stage=23 layers=2
-    49: ParallelTransformerLayerPipe
-    50: ParallelTransformerLayerPipe
-stage=24 layers=2
-    51: ParallelTransformerLayerPipe
-    52: ParallelTransformerLayerPipe
-stage=25 layers=2
-    53: ParallelTransformerLayerPipe
-    54: ParallelTransformerLayerPipe
-stage=26 layers=2
-    55: ParallelTransformerLayerPipe
-    56: ParallelTransformerLayerPipe
-stage=27 layers=2
-    57: ParallelTransformerLayerPipe
-    58: ParallelTransformerLayerPipe
-stage=28 layers=2
-    59: ParallelTransformerLayerPipe
-    60: ParallelTransformerLayerPipe
-stage=29 layers=2
-    61: ParallelTransformerLayerPipe
-    62: ParallelTransformerLayerPipe
-stage=30 layers=2
-    63: ParallelTransformerLayerPipe
-    64: ParallelTransformerLayerPipe
-stage=31 layers=6
-    65: ParallelTransformerLayerPipe
-    66: ParallelTransformerLayerPipe
-    67: <lambda>
-    68: MixedFusedLayerNorm
-    69: EmbeddingPipe
-    70: float16_to_fp32
-  loss: CrossEntropy
- > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800
807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
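A quick sanity check on the counts above (a minimal sketch: the 4-way tensor by 32-stage pipeline split is inferred from the (tensor, pipeline) rank pairs in this log, and everything else is plain arithmetic, not read from the run's training scripts):

    # Sanity check of the per-rank parameter counts reported above.
    # Inferred from the log: TP=4, PP=32; the first and last pipeline
    # stages also hold embedding weights, hence ~978M instead of ~807M.
    TP, PP = 4, 32
    middle = 807539800   # per-rank count on stages 1..30
    first = 978291800    # per-rank count on stage 0
    last = 978315000     # per-rank count on stage 31

    total = TP * (first + last + (PP - 2) * middle)
    print(f"{total:,}")  # 104,731,203,200

Summing across tensor ranks slightly overcounts whatever is replicated within a tensor-parallel group (layer norms, for instance), but it confirms the order of magnitude: this is a roughly 104B-parameter configuration.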
-[2021-10-22 19:42:07,851] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-22 19:42:07,852] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-22 19:42:07,852] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.84 GB, percent = 20.7%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-22 19:42:07,853] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-[2021-10-22 19:42:07,890] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-22 19:42:07,890] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-22 19:42:07,890] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-22 19:42:07,891] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-22 19:42:07,891] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-22 19:42:07,891] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-22 19:42:07,891] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-22 19:42:07,891] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-22 19:42:07,891] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-22 19:42:07,891] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
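For reference, the optimizer settings logged here (a client FusedAdam wrapped in an fp16 ZeRO stage 1 optimizer, 500M-element reduce and allgather buckets, no CPU offload) correspond to a DeepSpeed configuration along the following lines. This is a reconstruction from the log for illustration only; the run's actual config file is not shown here and may set additional fields:

    # Hypothetical DeepSpeed config matching the logged settings.
    import deepspeed

    ds_config = {
        "fp16": {"enabled": True},                 # "Creating fp16 ZeRO stage 1 optimizer"
        "zero_optimization": {
            "stage": 1,                            # shard optimizer states only
            "reduce_bucket_size": 500_000_000,     # "Reduce bucket size 500000000"
            "allgather_bucket_size": 500_000_000,  # "Allgather bucket size 500000000"
        },
    }

    # The optimizer is supplied by the caller, which is why the log says
    # "Using client Optimizer as basic optimizer":
    # engine, optimizer, _, _ = deepspeed.initialize(
    #     model=model, optimizer=fused_adam, config=ds_config)

The 292968 training iterations and the cosine decay style are presumably derived from Megatron-style command-line arguments rather than from this JSON config.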
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils...
-Time to load utils op: 0.5484132766723633 seconds
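The compiler warning comes from torch.utils.cpp_extension: the default `c++` alias on the cluster is not the `g++` that PyTorch was built with. It is harmless here (the build succeeds), but one way to silence it, assuming `g++` is available on the PATH, is to set the CXX environment variable before the first JIT build. A sketch, not something this run actually did:

    # cpp_extension falls back to the CXX environment variable (default
    # "c++") when choosing a compiler, so exporting g++ avoids the warning.
    import os
    os.environ["CXX"] = "g++"  # must run before the first extension is built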
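Each of the ~0.55 s "Time to load utils op" lines above is one rank going through the same JIT pipeline against the shared cache: emit a ninja file, find `ninja: no work to do.` because a sibling process already compiled the op, and import the module. A minimal sketch of that mechanism (the source file name is a placeholder; DeepSpeed ships its own op sources):

    # JIT-building a C++ extension the way DeepSpeed's op builder does.
    # The cache directory is controlled by TORCH_EXTENSIONS_DIR, which is
    # why every rank logs the same .cache/torch_extensions root.
    from torch.utils.cpp_extension import load

    utils_op = load(
        name="utils",            # cached under $TORCH_EXTENSIONS_DIR/utils/
        sources=["utils.cpp"],   # placeholder source file
        verbose=True,            # prints the build/load lines seen in this log
    )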
-Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 64 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 46 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 97 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0030868053436279297 seconds
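The partition lines decode as follows (the data-parallel size is inferred, not logged): with 128 ranks split into 4-way tensor by 32-way pipeline parallelism, each ZeRO data-parallel group holds a single rank, so ZeRO-1 has nothing to shard across and reports `partition count [1, 1]`. The two size tuples per rank, presumably the run's two optimizer parameter groups (e.g. with and without weight decay), also sum exactly to the per-rank parameter counts printed earlier:

    # Cross-checking the ZeRO partition sizes against the parameter counts.
    world, TP, PP = 128, 4, 32
    assert world // (TP * PP) == 1           # DP=1, hence partition count [1, 1]

    assert 807360000 + 179800 == 807539800   # middle pipeline stages
    assert 978112000 + 179800 == 978291800   # first stage (ranks 0-3)
    assert 978112000 + 203000 == 978315000   # last stage (ranks 124-127)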
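The messages above (repeated once per rank) come from PyTorch's JIT C++-extension loader, which DeepSpeed uses to compile its fused ops once and cache the build; subsequent ranks find no source changes and skip the build step. A minimal sketch of the mechanism, with the cache path taken from the log and the builder call assumed from DeepSpeed's op_builder API of that era:

```python
# Sketch only: how a DeepSpeed-style JIT op gets loaded.
# TORCH_EXTENSIONS_DIR is the "PyTorch extensions root" shown in the log.
import os

os.environ.setdefault(
    "TORCH_EXTENSIONS_DIR",
    "/gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions")

from deepspeed.ops.op_builder import UtilsBuilder  # assumed API (DeepSpeed ~0.5.x)

utils_op = UtilsBuilder().load()  # compiles the .so on first call; later calls
                                  # detect no modifications and just reload it
```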
-[2021-10-22 19:42:10,333] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-22 19:42:10,333] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-22 19:42:10,334] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.86 GB, percent = 20.8%
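In these see_memory_usage reports, MA/Max_MA are the current and peak GPU memory actually allocated by tensors, CA/Max_CA the current and peak memory held by PyTorch's caching allocator, and the last line is host memory. A rough re-implementation under those assumptions (not DeepSpeed's exact code):

```python
import psutil  # assumed available; used here for the CPU memory line
import torch

def see_memory_usage(message: str) -> None:
    gb = 1024 ** 3
    print(message)
    print(f"MA {torch.cuda.memory_allocated() / gb:.2f} GB "
          f"Max_MA {torch.cuda.max_memory_allocated() / gb:.2f} GB "
          f"CA {torch.cuda.memory_reserved() / gb:.2f} GB "
          f"Max_CA {torch.cuda.max_memory_reserved() / gb:.2f} GB")
    vm = psutil.virtual_memory()
    print(f"CPU Virtual Memory: used = {vm.used / gb:.2f} GB, "
          f"percent = {vm.percent}%")
```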
-[2021-10-22 19:42:10,379] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-22 19:42:10,380] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-22 19:42:10,380] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.86 GB, percent = 20.8%
-[2021-10-22 19:42:10,380] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-22 19:42:10,408] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-22 19:42:10,409] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-22 19:42:10,409] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 38.86 GB, percent = 20.8%
-[2021-10-22 19:42:10,409] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-22 19:42:10,409] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-22 19:42:10,410] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-22 19:42:10,410] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-22 19:42:10,410] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] activation_checkpointing_config {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-22 19:42:10,410] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-22 19:42:10,411] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-22 19:42:10,412] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-22 19:42:10,412] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
-[2021-10-22 19:42:10,413] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
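Two quick consistency checks on the configuration above. All constants are copied from the log; the data-parallel degree of 1 is inferred (128 ranks = 32 pipeline stages x 4-way tensor parallelism), and the curriculum function is a sketch of a "fixed_linear" schedule, not DeepSpeed's exact implementation:

```python
# Batch-size identity: train_batch_size =
#   micro_batch_size_per_gpu * gradient_accumulation_steps * data_parallel_degree
micro_batch_size_per_gpu = 1
gradient_accumulation_steps = 2048
data_parallel_degree = 1          # inferred, not printed directly in the config
train_batch_size = (micro_batch_size_per_gpu
                    * gradient_accumulation_steps
                    * data_parallel_degree)
assert train_batch_size == 2048   # matches train_batch_size in the config

# "fixed_linear" seqlen curriculum: ramp from 64 to 2048 tokens over 36000
# steps, rounded down to a multiple of difficulty_step (sketch only).
def seqlen_at_step(step, min_d=64, max_d=2048, total=36000, diff_step=8):
    d = min_d + (max_d - min_d) * min(step / total, 1.0)
    return max(min_d, int(d) - int(d) % diff_step)

assert seqlen_at_step(0) == 64
assert seqlen_at_step(18000) == 1056   # halfway through the ramp
assert seqlen_at_step(36000) == 2048
```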
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-22 19:42:10,723] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
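Each of the 32 pipeline stages above is reported by 4 tensor-parallel ranks (128 ranks in all), and the per-stage counts sum exactly to the printed totals. A worked check with the numbers from the log:

```python
# Parameter bookkeeping, all counts taken from the engine.py:151 lines above.
first_stage  = 978_291_800    # STAGE=0: 5 layers, includes the input embedding
middle_stage = 807_539_800    # STAGE=1..30: two transformer layers each
last_stage   = 978_315_000    # STAGE=31: 6 layers, includes the output layers

per_stage_sum = first_stage + 30 * middle_stage + last_stage
total = 4 * per_stage_sum     # each stage is printed by 4 tensor-parallel ranks
assert total == 104_731_203_200                # TOTAL_PARAMS in the log

# UNIQUE_PARAMS is 683,008,000 smaller -- plausibly weights shared between
# stages (e.g. tied input/output embeddings) counted only once.
assert total - 104_048_195_200 == 683_008_000  # UNIQUE_PARAMS delta
```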
-[2021-10-22 19:42:10,821] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,822] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,823] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-22 19:42:10,824] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
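For context on the warning above: DeepSpeed's load_checkpoint resolves which checkpoint to load by reading a one-line file named `latest` inside the checkpoint directory; when that file is missing, every rank emits this warning and the run falls through to random initialization. Below is a minimal sketch of recreating the `latest` file from the saved global_step* subdirectories, assuming DeepSpeed's usual layout (one `global_stepN/` directory per save); the helper name restore_latest_tag is hypothetical, not part of DeepSpeed.

import os
from typing import Optional

def restore_latest_tag(load_dir: str) -> Optional[str]:
    """Rewrite <load_dir>/latest to point at the newest global_step* save."""
    tags = [d for d in os.listdir(load_dir)
            if d.startswith("global_step")
            and os.path.isdir(os.path.join(load_dir, d))]
    if not tags:
        return None  # nothing saved yet; training will start from random init
    tag = max(tags, key=lambda t: int(t[len("global_step"):]))
    with open(os.path.join(load_dir, "latest"), "w") as f:
        f.write(tag)  # DeepSpeed reads this single-line file to pick the tag
    return tag

# Alternatively, pass the tag explicitly, as the warning itself suggests:
#   model_engine.load_checkpoint(load_dir, tag="global_step100")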
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints - will not load any checkpoints and will start from random
-time (ms) | load-checkpoint: 7.63
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 103.3650944
-estimated model parameters without embeddings: 103.3650944
the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the 
embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") - -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate 
with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated 
model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be 
inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter 
count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 - - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model 
parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 125.2213504 -estimated model parameters: 103.3650944 -estimated model parameters: 125.2213504 -estimated model 
parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.2213504estimated model parameters: 125.2213504 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432estimated model parameters: 125.22432 - -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model 
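Note: the per-rank counts above come from summing parameter element counts on each pipeline stage; because the first and last stages each hold a copy of the tied embedding weights, adding stage totals under PP > 1 counts the embeddings more than once, which is what the warning flags. A minimal sketch of this kind of accounting (illustrative only, not the exact Megatron-DeepSpeed code; count_params_billions is a hypothetical helper):

    import torch.nn as nn

    def count_params_billions(model: nn.Module) -> float:
        # Sum of element counts over all locally held parameters, reported in
        # billions, in the style of the "estimated model parameters" lines above.
        return sum(p.numel() for p in model.parameters()) / 1e9

    # Under pipeline parallelism, summing this across stages double-counts any
    # weights (e.g. tied embeddings) replicated on the first and last stage.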
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-22 19:42:10
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 3000320
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.144066 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.255 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.216 seconds
-    total number of samples: 6927161
-    total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.048 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-22 19:42:17
-done with setup ...
-training ...
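Note on the index files above: Megatron-style GPT datasets precompute doc-idx, sample-idx and shuffle-idx arrays once and save them as .npy files, so later runs memory-map them instead of rebuilding. The epoch count is consistent with the sample arithmetic: 657686117 samples over 5 epochs is about 131.5M samples per epoch, and 600000000 / 131537223 ≈ 4.6, so 5 is the smallest whole number of epochs covering the train target. A minimal sketch of reading such an index back, assuming only that the files are ordinary NumPy arrays (paths as logged above):

    import numpy as np

    # mmap_mode="r" maps the arrays lazily instead of reading them into RAM,
    # which is why reloading these large indexes takes well under a second.
    prefix = "/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s"
    doc_idx = np.load(prefix + "_doc_idx.npy", mmap_mode="r")
    sample_idx = np.load(prefix + "_sample_idx.npy", mmap_mode="r")
    shuffle_idx = np.load(prefix + "_shuffle_idx.npy", mmap_mode="r")
    print(doc_idx.shape, sample_idx.shape, shuffle_idx.shape)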
-Number of parameters: 103.3650944 billion
-Number of parameters: 125.2213504 billion
-Number of parameters without embeddings: 103.3650944 billion
-time (ms) | model-and-optimizer-setup: 5383.75 | train/valid/test-data-iterators-setup: 5544.86
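Counts like the ones above can be sanity-checked against the usual decoder-only transformer estimate: roughly 12*h^2 + 13*h weights per layer, plus V*h token embeddings and s*h learned positions. A sketch of that generic approximation (the tr8b-104B dimensions themselves are not shown in this excerpt, so no numbers are plugged in here):

    def approx_gpt_params_billions(num_layers: int, hidden: int,
                                   vocab: int, seq_len: int) -> float:
        # Per layer: 12*h^2 weights (3h^2 for QKV, h^2 for the attention output
        # projection, 4h^2 + 4h^2 for the MLP) plus ~13*h biases/layernorms.
        per_layer = 12 * hidden ** 2 + 13 * hidden
        embeddings = vocab * hidden + seq_len * hidden
        return (num_layers * per_layer + embeddings) / 1e9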
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.368064 billion
-[before the start of training step] datetime: 2021-10-22 19:42:17
-[2021-10-22 19:42:17,113] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-22 19:42:17,114] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-22 19:42:17,114] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-22 19:42:17,114] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-22 19:42:17,114] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
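The five INFO lines above echo DeepSpeed's activation-checkpointing settings; a sketch of the config section that would produce these flags, assuming the standard DeepSpeed activation_checkpointing keys (shown here as a Python dict):

    deepspeed_config = {
        "activation_checkpointing": {
            "partition_activations": False,            # "Partition Activations False"
            "cpu_checkpointing": False,                # "CPU CHECKPOINTING False"
            "contiguous_memory_optimization": False,   # "contiguous Memory Checkpointing False"
            "synchronize_checkpoint_boundary": False,  # "Synchronization False"
            "profile": False,                          # "Profiling time in checkpointing False"
        }
    }

The per-rank memory reports that follow are the standard torch.cuda allocator counters, printed once after the first iteration. A minimal sketch of producing one such line (report_memory_line is a hypothetical helper, not the Megatron function itself):

    import torch

    def report_memory_line(rank: int, iteration: int) -> str:
        mb = 1024 * 1024
        # allocated: memory held by live tensors; reserved: memory held by the
        # CUDA caching allocator (so reserved >= allocated at any instant).
        return (f"[Rank {rank}] (after {iteration} iterations) memory (MB) | "
                f"allocated: {torch.cuda.memory_allocated() / mb} | "
                f"max allocated: {torch.cuda.max_memory_allocated() / mb} | "
                f"reserved: {torch.cuda.memory_reserved() / mb} | "
                f"max reserved: {torch.cuda.max_memory_reserved() / mb}")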
-[Rank 3] (after 1 iterations) memory (MB) | allocated: 13202.98291015625 | max allocated: 20666.53173828125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 7] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 11] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 1] (after 1 iterations) memory (MB) | allocated: 13203.21533203125 | max allocated: 20666.76416015625 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 5] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 9] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 13] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 17] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 125] (after 1 iterations) memory (MB) | allocated: 13082.482421875 | max allocated: 20546.08837890625 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 21] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 25] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 29] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 33] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 37] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 41] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 45] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 49] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 53] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 57] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 61] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 65] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 73] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 77] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 69] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 81] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 85] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 19] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 97] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 89] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 93] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 101] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 109] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20074.0 | max reserved: 20074.0
-[Rank 105] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 15] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 113] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0
-[Rank 117] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0
-[Rank 121] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0
-[Rank 6] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 2] (after 1 iterations) memory (MB) | allocated: 13201.28759765625 | max allocated: 20664.83642578125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 27] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 14] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 35] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 10] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 31] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 126] (after 1 iterations) memory (MB) | allocated: 13082.38818359375 | max allocated: 20545.994140625 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 18] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 39] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 4] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 0] (after 1 iterations) memory (MB) | allocated: 13201.60791015625 | max allocated: 20665.15673828125 | reserved: 24442.0 | max reserved: 24442.0
-[Rank 43] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 23] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 22] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 124] (after 1 iterations) memory (MB) | allocated: 13082.38818359375 | max allocated: 20545.994140625 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 30] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 47] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 26] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 8] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 34] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 12] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 24] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 51] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 59] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 55] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 42] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 38] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 16] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 28] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 32] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 20] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 63] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 46] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 50] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 67] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 40] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 75] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 54] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 44] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 58] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 36] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 79] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 71] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 66] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 62] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 48] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 52] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 60] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 70] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 56] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 83] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 87] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 74] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 64] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 91] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated:
16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 78] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 72] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 82] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 68] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 86] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 80] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 95] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 76] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 90] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 94] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 84] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 -[Rank 99] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 103] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 98] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 88] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 96] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 111] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 102] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 92] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 107] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0[Rank 106] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 - -[Rank 100] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 110] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 
16947.29541015625 | reserved: 20074.0 | max reserved: 20074.0 -[Rank 108] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20074.0 | max reserved: 20074.0 -[Rank 104] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 -[Rank 114] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 119] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 118] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 116] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 115] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 112] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 123] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 122] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 -[Rank 120] (after 1 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 - iteration 1/ 292968 | consumed samples: 2048 | consumed tokens: 131072 | elapsed time per iteration (ms): 155343.8 | learning rate: 5.680E-07 | global batch size: 2048 | lm loss: 1.104119E+01 | loss scale: 4096.0 | grad norm: 261416.473 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -[Rank 127] (after 1 iterations) memory (MB) | allocated: 13083.8984375 | max allocated: 20547.50439453125 | reserved: 24406.0 | max reserved: 24406.0 -time (ms) - iteration 2/ 292968 | consumed samples: 4096 | consumed tokens: 262144 | elapsed time per iteration (ms): 89531.2 | learning rate: 1.136E-06 | global batch size: 2048 | lm loss: 1.104001E+01 | loss scale: 4096.0 | grad norm: 262433.480 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3/ 292968 | consumed samples: 6144 | consumed tokens: 393216 | elapsed time per iteration (ms): 90335.9 | learning rate: 1.704E-06 | global batch size: 2048 | lm loss: 1.462783E+01 | loss scale: 4096.0 | grad norm: 1385164.876 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 4/ 292968 | consumed samples: 8192 | consumed tokens: 524288 | elapsed time per iteration (ms): 90865.1 | learning rate: 2.272E-06 | global batch size: 2048 | lm loss: 1.222460E+01 | loss scale: 4096.0 | grad norm: 1035875.605 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 5/ 292968 | consumed samples: 10240 | consumed tokens: 655360 | elapsed time per iteration (ms): 88228.7 | learning rate: 2.840E-06 | global batch size: 2048 | lm loss: 1.105129E+01 | loss scale: 4096.0 
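[Note: the per-rank block above is machine-readable, and a minimal Python sketch for summarizing it follows; it is not part of the original run. The regex assumes exactly the `[Rank N] (after K iterations) memory (MB) | ...` field order shown above, and the path "main_log.txt" is a placeholder. The three allocation tiers visible above (about 13.2 GB on ranks 0-3, about 13.1 GB on ranks 124-127, about 10.8 GB elsewhere) would be consistent with the first and last pipeline stages carrying extra embedding state, but the log itself does not say so.

import re

# Matches one per-rank memory record; one physical line may hold several.
MEM_RE = re.compile(
    r"\[Rank (\d+)\] \(after (\d+) iterations\) memory \(MB\) \| "
    r"allocated: ([\d.]+) \| max allocated: ([\d.]+) \| "
    r"reserved: ([\d.]+) \| max reserved: ([\d.]+)"
)

def parse_memory(path):
    """Return {rank: (allocated, max_allocated, reserved, max_reserved)} in MB."""
    out = {}
    with open(path) as f:
        for line in f:
            for m in MEM_RE.finditer(line):
                out[int(m.group(1))] = tuple(map(float, m.group(3, 4, 5, 6)))
    return out

if __name__ == "__main__":
    mem = parse_memory("main_log.txt")  # placeholder path
    alloc = {rank: v[0] for rank, v in mem.items()}
    top, bot = max(alloc, key=alloc.get), min(alloc, key=alloc.get)
    print(f"{len(mem)} ranks reported")
    print(f"highest allocated: rank {top} at {alloc[top]:.1f} MB")
    print(f"lowest allocated:  rank {bot} at {alloc[bot]:.1f} MB")

End of note.]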
- iteration 6/ 292968 | consumed samples: 12288 | consumed tokens: 786432 | elapsed time per iteration (ms): 90364.0 | learning rate: 3.408E-06 | global batch size: 2048 | lm loss: 1.302851E+01 | loss scale: 4096.0 | grad norm: 504762.354 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 7/ 292968 | consumed samples: 14336 | consumed tokens: 917504 | elapsed time per iteration (ms): 89525.8 | learning rate: 3.976E-06 | global batch size: 2048 | lm loss: 1.269341E+01 | loss scale: 4096.0 | grad norm: 531716.693 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 8/ 292968 | consumed samples: 16384 | consumed tokens: 1048576 | elapsed time per iteration (ms): 89631.2 | learning rate: 4.544E-06 | global batch size: 2048 | lm loss: 1.177836E+01 | loss scale: 4096.0 | grad norm: 53795.591 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 9/ 292968 | consumed samples: 18432 | consumed tokens: 1179648 | elapsed time per iteration (ms): 88962.1 | learning rate: 5.112E-06 | global batch size: 2048 | lm loss: 1.117707E+01 | loss scale: 4096.0 | grad norm: 42672.353 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 10/ 292968 | consumed samples: 20480 | consumed tokens: 1310720 | elapsed time per iteration (ms): 90753.3 | learning rate: 5.680E-06 | global batch size: 2048 | lm loss: 1.033078E+01 | loss scale: 4096.0 | grad norm: 35450.105 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 11/ 292968 | consumed samples: 22528 | consumed tokens: 1441792 | elapsed time per iteration (ms): 96012.6 | learning rate: 6.249E-06 | global batch size: 2048 | lm loss: 1.006670E+01 | loss scale: 4096.0 | grad norm: 173306.280 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 12/ 292968 | consumed samples: 24576 | consumed tokens: 1572864 | elapsed time per iteration (ms): 88995.1 | learning rate: 6.817E-06 | global batch size: 2048 | lm loss: 1.013344E+01 | loss scale: 4096.0 | grad norm: 289208.468 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 13/ 292968 | consumed samples: 26624 | consumed tokens: 1703936 | elapsed time per iteration (ms): 88746.8 | learning rate: 7.385E-06 | global batch size: 2048 | lm loss: 9.343867E+00 | loss scale: 4096.0 | grad norm: 124547.105 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 14/ 292968 | consumed samples: 28672 | consumed tokens: 1835008 | elapsed time per iteration (ms): 87326.8 | learning rate: 7.953E-06 | global batch size: 2048 | lm loss: 9.136629E+00 | loss scale: 4096.0 | grad norm: 65358.765 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 15/ 292968 | consumed samples: 30720 | consumed tokens: 1966080 | elapsed time per iteration (ms): 99598.2 | learning rate: 8.521E-06 | global batch size: 2048 | lm loss: 8.896122E+00 | loss scale: 4096.0 | grad norm: 33640.726 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 16/ 292968 | consumed samples: 32768 | consumed tokens: 2097152 | elapsed time per iteration (ms): 112821.1 | learning rate: 9.089E-06 | global batch size: 2048 | lm loss: 8.753995E+00 | loss scale: 4096.0 | grad norm: 26272.826 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 17/ 292968 | consumed samples: 34816 | consumed tokens: 2228224 | elapsed time per iteration (ms): 113171.8 | learning rate: 9.657E-06 | global batch size: 2048 | lm loss: 8.644328E+00 | loss scale: 4096.0 | grad norm: 28987.568 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 18/ 292968 | consumed samples: 36864 | consumed tokens: 2359296 | elapsed time per iteration (ms): 92106.5 | learning rate: 1.022E-05 | global batch size: 2048 | lm loss: 8.528214E+00 | loss scale: 4096.0 | grad norm: 35684.095 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 19/ 292968 | consumed samples: 38912 | consumed tokens: 2490368 | elapsed time per iteration (ms): 89015.0 | learning rate: 1.079E-05 | global batch size: 2048 | lm loss: 8.372327E+00 | loss scale: 4096.0 | grad norm: 38456.795 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 20/ 292968 | consumed samples: 40960 | consumed tokens: 2621440 | elapsed time per iteration (ms): 91951.6 | learning rate: 1.136E-05 | global batch size: 2048 | lm loss: 8.355244E+00 | loss scale: 4096.0 | grad norm: 43872.088 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 21/ 292968 | consumed samples: 43008 | consumed tokens: 2752512 | elapsed time per iteration (ms): 95701.0 | learning rate: 1.193E-05 | global batch size: 2048 | lm loss: 8.362148E+00 | loss scale: 4096.0 | grad norm: 70716.750 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 22/ 292968 | consumed samples: 45056 | consumed tokens: 2883584 | elapsed time per iteration (ms): 92107.3 | learning rate: 1.250E-05 | global batch size: 2048 | lm loss: 8.278668E+00 | loss scale: 4096.0 | grad norm: 59801.834 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 23/ 292968 | consumed samples: 47104 | consumed tokens: 3014656 | elapsed time per iteration (ms): 90908.8 | learning rate: 1.307E-05 | global batch size: 2048 | lm loss: 8.146460E+00 | loss scale: 4096.0 | grad norm: 18576.409 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 24/ 292968 | consumed samples: 49152 | consumed tokens: 3145728 | elapsed time per iteration (ms): 88708.2 | learning rate: 1.363E-05 | global batch size: 2048 | lm loss: 8.119708E+00 | loss scale: 4096.0 | grad norm: 20643.527 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 25/ 292968 | consumed samples: 51200 | consumed tokens: 3276800 | elapsed time per iteration (ms): 90029.8 | learning rate: 1.420E-05 | global batch size: 2048 | lm loss: 8.030657E+00 | loss scale: 4096.0 | grad norm: 20426.325 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 26/ 292968 | consumed samples: 53248 | consumed tokens: 3407872 | elapsed time per iteration (ms): 88060.6 | learning rate: 1.477E-05 | global batch size: 2048 | lm loss: 7.992906E+00 | loss scale: 4096.0 | grad norm: 18450.042 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 27/ 292968 | consumed samples: 55296 | consumed tokens: 3538944 | elapsed time per iteration (ms): 87576.5 | learning rate: 1.534E-05 | global batch size: 2048 | lm loss: 7.913804E+00 | loss scale: 4096.0 | grad norm: 15801.076 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 28/ 292968 | consumed samples: 57344 | consumed tokens: 3670016 | elapsed time per iteration (ms): 88054.6 | learning rate: 1.591E-05 | global batch size: 2048 | lm loss: 7.892510E+00 | loss scale: 4096.0 | grad norm: 20085.329 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 29/ 292968 | consumed samples: 59392 | consumed tokens: 3801088 | elapsed time per iteration (ms): 88346.9 | learning rate: 1.647E-05 | global batch size: 2048 | lm loss: 7.848950E+00 | loss scale: 4096.0 | grad norm: 18661.056 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 30/ 292968 | consumed samples: 61440 | consumed tokens: 3932160 | elapsed time per iteration (ms): 87848.3 | learning rate: 1.704E-05 | global batch size: 2048 | lm loss: 7.834585E+00 | loss scale: 4096.0 | grad norm: 17634.073 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 31/ 292968 | consumed samples: 63488 | consumed tokens: 4063232 | elapsed time per iteration (ms): 86855.8 | learning rate: 1.761E-05 | global batch size: 2048 | lm loss: 7.774508E+00 | loss scale: 4096.0 | grad norm: 13680.334 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 32/ 292968 | consumed samples: 65536 | consumed tokens: 4194304 | elapsed time per iteration (ms): 87535.9 | learning rate: 1.818E-05 | global batch size: 2048 | lm loss: 7.786371E+00 | loss scale: 4096.0 | grad norm: 13901.015 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 33/ 292968 | consumed samples: 67584 | consumed tokens: 4325376 | elapsed time per iteration (ms): 89116.5 | learning rate: 1.875E-05 | global batch size: 2048 | lm loss: 7.777013E+00 | loss scale: 4096.0 | grad norm: 12165.550 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 34/ 292968 | consumed samples: 69632 | consumed tokens: 4456448 | elapsed time per iteration (ms): 86951.7 | learning rate: 1.931E-05 | global batch size: 2048 | lm loss: 7.754364E+00 | loss scale: 4096.0 | grad norm: 9428.975 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 35/ 292968 | consumed samples: 71680 | consumed tokens: 4587520 | elapsed time per iteration (ms): 86340.0 | learning rate: 1.988E-05 | global batch size: 2048 | lm loss: 7.751292E+00 | loss scale: 4096.0 | grad norm: 13138.732 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 36/ 292968 | consumed samples: 73728 | consumed tokens: 4718592 | elapsed time per iteration (ms): 88749.5 | learning rate: 2.045E-05 | global batch size: 2048 | lm loss: 7.721442E+00 | loss scale: 4096.0 | grad norm: 11052.509 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 37/ 292968 | consumed samples: 75776 | consumed tokens: 4849664 | elapsed time per iteration (ms): 94242.0 | learning rate: 2.102E-05 | global batch size: 2048 | lm loss: 7.775472E+00 | loss scale: 4096.0 | grad norm: 10223.253 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 38/ 292968 | consumed samples: 77824 | consumed tokens: 4980736 | elapsed time per iteration (ms): 95257.8 | learning rate: 2.159E-05 | global batch size: 2048 | lm loss: 7.726554E+00 | loss scale: 4096.0 | grad norm: 6347.728 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 39/ 292968 | consumed samples: 79872 | consumed tokens: 5111808 | elapsed time per iteration (ms): 88167.5 | learning rate: 2.215E-05 | global batch size: 2048 | lm loss: 7.776631E+00 | loss scale: 4096.0 | grad norm: 11502.365 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 40/ 292968 | consumed samples: 81920 | consumed tokens: 5242880 | elapsed time per iteration (ms): 88292.1 | learning rate: 2.272E-05 | global batch size: 2048 | lm loss: 7.735412E+00 | loss scale: 4096.0 | grad norm: 10785.585 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 41/ 292968 | consumed samples: 83968 | consumed tokens: 5373952 | elapsed time per iteration (ms): 99634.6 | learning rate: 2.329E-05 | global batch size: 2048 | lm loss: 7.727369E+00 | loss scale: 4096.0 | grad norm: 8036.944 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 42/ 292968 | consumed samples: 86016 | consumed tokens: 5505024 | elapsed time per iteration (ms): 109316.4 | learning rate: 2.386E-05 | global batch size: 2048 | lm loss: 7.740176E+00 | loss scale: 4096.0 | grad norm: 12550.111 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 43/ 292968 | consumed samples: 88064 | consumed tokens: 5636096 | elapsed time per iteration (ms): 112497.3 | learning rate: 2.443E-05 | global batch size: 2048 | lm loss: 7.733941E+00 | loss scale: 4096.0 | grad norm: 9284.266 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 44/ 292968 | consumed samples: 90112 | consumed tokens: 5767168 | elapsed time per iteration (ms): 94979.4 | learning rate: 2.499E-05 | global batch size: 2048 | lm loss: 7.754740E+00 | loss scale: 4096.0 | grad norm: 13500.069 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 45/ 292968 | consumed samples: 92160 | consumed tokens: 5898240 | elapsed time per iteration (ms): 92686.1 | learning rate: 2.556E-05 | global batch size: 2048 | lm loss: 7.735516E+00 | loss scale: 4096.0 | grad norm: 15006.510 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 46/ 292968 | consumed samples: 94208 | consumed tokens: 6029312 | elapsed time per iteration (ms): 89167.6 | learning rate: 2.613E-05 | global batch size: 2048 | lm loss: 7.742296E+00 | loss scale: 4096.0 | grad norm: 11202.084 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 47/ 292968 | consumed samples: 96256 | consumed tokens: 6160384 | elapsed time per iteration (ms): 88271.8 | learning rate: 2.670E-05 | global batch size: 2048 | lm loss: 7.727777E+00 | loss scale: 4096.0 | grad norm: 16551.779 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 48/ 292968 | consumed samples: 98304 | consumed tokens: 6291456 | elapsed time per iteration (ms): 87067.5 | learning rate: 2.727E-05 | global batch size: 2048 | lm loss: 7.734728E+00 | loss scale: 4096.0 | grad norm: 9922.676 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 49/ 292968 | consumed samples: 100352 | consumed tokens: 6422528 | elapsed time per iteration (ms): 88520.0 | learning rate: 2.783E-05 | global batch size: 2048 | lm loss: 7.768594E+00 | loss scale: 4096.0 | grad norm: 33877.603 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 50/ 292968 | consumed samples: 102400 | consumed tokens: 6553600 | elapsed time per iteration (ms): 86390.2 | learning rate: 2.840E-05 | global batch size: 2048 | lm loss: 7.752273E+00 | loss scale: 4096.0 | grad norm: 15884.898 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 51/ 292968 | consumed samples: 104448 | consumed tokens: 6684672 | elapsed time per iteration (ms): 88902.5 | learning rate: 2.897E-05 | global batch size: 2048 | lm loss: 8.348561E+00 | loss scale: 4096.0 | grad norm: 108304.793 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 52/ 292968 | consumed samples: 106496 | consumed tokens: 6815744 | elapsed time per iteration (ms): 87110.8 | learning rate: 2.954E-05 | global batch size: 2048 | lm loss: 8.134525E+00 | loss scale: 4096.0 | grad norm: 53171.887 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 53/ 292968 | consumed samples: 108544 | consumed tokens: 6946816 | elapsed time per iteration (ms): 88062.0 | learning rate: 3.011E-05 | global batch size: 2048 | lm loss: 8.449836E+00 | loss scale: 4096.0 | grad norm: 31357.259 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 54/ 292968 | consumed samples: 110592 | consumed tokens: 7077888 | elapsed time per iteration (ms): 87731.9 | learning rate: 3.067E-05 | global batch size: 2048 | lm loss: 8.427136E+00 | loss scale: 4096.0 | grad norm: 28965.925 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 55/ 292968 | consumed samples: 112640 | consumed tokens: 7208960 | elapsed time per iteration (ms): 83999.6 | learning rate: 3.124E-05 | global batch size: 2048 | lm loss: 8.305291E+00 | loss scale: 4096.0 | grad norm: 59085.476 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 56/ 292968 | consumed samples: 114688 | consumed tokens: 7340032 | elapsed time per iteration (ms): 85632.3 | learning rate: 3.181E-05 | global batch size: 2048 | lm loss: 8.021071E+00 | loss scale: 4096.0 | grad norm: 38109.304 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 57/ 292968 | consumed samples: 116736 | consumed tokens: 7471104 | elapsed time per iteration (ms): 85262.0 | learning rate: 3.238E-05 | global batch size: 2048 | lm loss: 7.994979E+00 | loss scale: 4096.0 | grad norm: 84266.593 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 58/ 292968 | consumed samples: 118784 | consumed tokens: 7602176 | elapsed time per iteration (ms): 86089.9 | learning rate: 3.295E-05 | global batch size: 2048 | lm loss: 8.005114E+00 | loss scale: 4096.0 | grad norm: 82354.178 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 59/ 292968 | consumed samples: 120832 | consumed tokens: 7733248 | elapsed time per iteration (ms): 87514.8 | learning rate: 3.351E-05 | global batch size: 2048 | lm loss: 8.163286E+00 | loss scale: 4096.0 | grad norm: 143866.369 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 60/ 292968 | consumed samples: 122880 | consumed tokens: 7864320 | elapsed time per iteration (ms): 86696.7 | learning rate: 3.408E-05 | global batch size: 2048 | lm loss: 8.117870E+00 | loss scale: 4096.0 | grad norm: 87305.550 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 61/ 292968 | consumed samples: 124928 | consumed tokens: 7995392 | elapsed time per iteration (ms): 86123.6 | learning rate: 3.465E-05 | global batch size: 2048 | lm loss: 8.063112E+00 | loss scale: 4096.0 | grad norm: 43178.466 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 62/ 292968 | consumed samples: 126976 | consumed tokens: 8126464 | elapsed time per iteration (ms): 85391.9 | learning rate: 3.522E-05 | global batch size: 2048 | lm loss: 8.054396E+00 | loss scale: 4096.0 | grad norm: 29089.157 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 63/ 292968 | consumed samples: 129024 | consumed tokens: 8257536 | elapsed time per iteration (ms): 86010.1 | learning rate: 3.579E-05 | global batch size: 2048 | lm loss: 7.942375E+00 | loss scale: 4096.0 | grad norm: 26496.302 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 64/ 292968 | consumed samples: 131072 | consumed tokens: 8388608 | elapsed time per iteration (ms): 89734.9 | learning rate: 3.636E-05 | global batch size: 2048 | lm loss: 7.955458E+00 | loss scale: 4096.0 | grad norm: 88339.485 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 65/ 292968 | consumed samples: 133120 | consumed tokens: 8519680 | elapsed time per iteration (ms): 90962.1 | learning rate: 3.692E-05 | global batch size: 2048 | lm loss: 7.991998E+00 | loss scale: 4096.0 | grad norm: 99841.120 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 66/ 292968 | consumed samples: 135168 | consumed tokens: 8650752 | elapsed time per iteration (ms): 87424.2 | learning rate: 3.749E-05 | global batch size: 2048 | lm loss: 7.995114E+00 | loss scale: 4096.0 | grad norm: 118350.933 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 67/ 292968 | consumed samples: 137216 | consumed tokens: 8781824 | elapsed time per iteration (ms): 86361.8 | learning rate: 3.806E-05 | global batch size: 2048 | lm loss: 7.883905E+00 | loss scale: 4096.0 | grad norm: 59370.819 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 68/ 292968 | consumed samples: 139264 | consumed tokens: 8912896 | elapsed time per iteration (ms): 95061.6 | learning rate: 3.863E-05 | global batch size: 2048 | lm loss: 7.887863E+00 | loss scale: 4096.0 | grad norm: 60138.768 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 69/ 292968 | consumed samples: 141312 | consumed tokens: 9043968 | elapsed time per iteration (ms): 96896.8 | learning rate: 3.920E-05 | global batch size: 2048 | lm loss: 7.847830E+00 | loss scale: 4096.0 | grad norm: 25277.613 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 70/ 292968 | consumed samples: 143360 | consumed tokens: 9175040 | elapsed time per iteration (ms): 103174.5 | learning rate: 3.976E-05 | global batch size: 2048 | lm loss: 7.808884E+00 | loss scale: 4096.0 | grad norm: 24361.871 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 71/ 292968 | consumed samples: 145408 | consumed tokens: 9306112 | elapsed time per iteration (ms): 95524.5 | learning rate: 4.033E-05 | global batch size: 2048 | lm loss: 7.758329E+00 | loss scale: 4096.0 | grad norm: 28364.339 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 72/ 292968 | consumed samples: 147456 | consumed tokens: 9437184 | elapsed time per iteration (ms): 86777.0 | learning rate: 4.090E-05 | global batch size: 2048 | lm loss: 7.820934E+00 | loss scale: 4096.0 | grad norm: 59989.165 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 73/ 292968 | consumed samples: 149504 | consumed tokens: 9568256 | elapsed time per iteration (ms): 86374.6 | learning rate: 4.147E-05 | global batch size: 2048 | lm loss: 7.833698E+00 | loss scale: 4096.0 | grad norm: 77920.790 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 74/ 292968 | consumed samples: 151552 | consumed tokens: 9699328 | elapsed time per iteration (ms): 86434.0 | learning rate: 4.204E-05 | global batch size: 2048 | lm loss: 7.717345E+00 | loss scale: 4096.0 | grad norm: 25247.613 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 75/ 292968 | consumed samples: 153600 | consumed tokens: 9830400 | elapsed time per iteration (ms): 84888.0 | learning rate: 4.260E-05 | global batch size: 2048 | lm loss: 7.728312E+00 | loss scale: 4096.0 | grad norm: 24863.995 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 76/ 292968 | consumed samples: 155648 | consumed tokens: 9961472 | elapsed time per iteration (ms): 85053.7 | learning rate: 4.317E-05 | global batch size: 2048 | lm loss: 7.708974E+00 | loss scale: 4096.0 | grad norm: 22405.252 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 77/ 292968 | consumed samples: 157696 | consumed tokens: 10092544 | elapsed time per iteration (ms): 84969.8 | learning rate: 4.374E-05 | global batch size: 2048 | lm loss: 7.701325E+00 | loss scale: 4096.0 | grad norm: 24456.465 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 78/ 292968 | consumed samples: 159744 | consumed tokens: 10223616 | elapsed time per iteration (ms): 86217.4 | learning rate: 4.431E-05 | global batch size: 2048 | lm loss: 7.657438E+00 | loss scale: 4096.0 | grad norm: 20716.094 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 79/ 292968 | consumed samples: 161792 | consumed tokens: 10354688 | elapsed time per iteration (ms): 85707.8 | learning rate: 4.488E-05 | global batch size: 2048 | lm loss: 7.701501E+00 | loss scale: 4096.0 | grad norm: 46133.150 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 80/ 292968 | consumed samples: 163840 | consumed tokens: 10485760 | elapsed time per iteration (ms): 84714.3 | learning rate: 4.544E-05 | global batch size: 2048 | lm loss: 7.728194E+00 | loss scale: 4096.0 | grad norm: 52455.841 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 81/ 292968 | consumed samples: 165888 | consumed tokens: 10616832 | elapsed time per iteration (ms): 86720.2 | learning rate: 4.601E-05 | global batch size: 2048 | lm loss: 7.663990E+00 | loss scale: 4096.0 | grad norm: 16781.465 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 82/ 292968 | consumed samples: 167936 | consumed tokens: 10747904 | elapsed time per iteration (ms): 85462.3 | learning rate: 4.658E-05 | global batch size: 2048 | lm loss: 7.625393E+00 | loss scale: 4096.0 | grad norm: 16494.139 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 83/ 292968 | consumed samples: 169984 | consumed tokens: 10878976 | elapsed time per iteration (ms): 87485.0 | learning rate: 4.715E-05 | global batch size: 2048 | lm loss: 7.681896E+00 | loss scale: 4096.0 | grad norm: 27727.502 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 84/ 292968 | consumed samples: 172032 | consumed tokens: 11010048 | elapsed time per iteration (ms): 86170.4 | learning rate: 4.772E-05 | global batch size: 2048 | lm loss: 7.651110E+00 | loss scale: 4096.0 | grad norm: 26751.884 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 85/ 292968 | consumed samples: 174080 | consumed tokens: 11141120 | elapsed time per iteration (ms): 85007.4 | learning rate: 4.828E-05 | global batch size: 2048 | lm loss: 7.613363E+00 | loss scale: 4096.0 | grad norm: 24658.672 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 86/ 292968 | consumed samples: 176128 | consumed tokens: 11272192 | elapsed time per iteration (ms): 85388.1 | learning rate: 4.885E-05 | global batch size: 2048 | lm loss: 7.588942E+00 | loss scale: 4096.0 | grad norm: 17595.942 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 87/ 292968 | consumed samples: 178176 | consumed tokens: 11403264 | elapsed time per iteration (ms): 85526.1 | learning rate: 4.942E-05 | global batch size: 2048 | lm loss: 7.615811E+00 | loss scale: 4096.0 | grad norm: 38697.423 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 88/ 292968 | consumed samples: 180224 | consumed tokens: 11534336 | elapsed time per iteration (ms): 85847.1 | learning rate: 4.999E-05 | global batch size: 2048 | lm loss: 7.630613E+00 | loss scale: 4096.0 | grad norm: 21094.672 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 89/ 292968 | consumed samples: 182272 | consumed tokens: 11665408 | elapsed time per iteration (ms): 84451.9 | learning rate: 5.056E-05 | global batch size: 2048 | lm loss: 7.592119E+00 | loss scale: 4096.0 | grad norm: 19528.869 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 90/ 292968 | consumed samples: 184320 | consumed tokens: 11796480 | elapsed time per iteration (ms): 88409.7 | learning rate: 5.112E-05 | global batch size: 2048 | lm loss: 1.217706E+01 | loss scale: 4096.0 | grad norm: 109407.868 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 91/ 292968 | consumed samples: 186368 | consumed tokens: 11927552 | elapsed time per iteration (ms): 89698.1 | learning rate: 5.169E-05 | global batch size: 2048 | lm loss: 1.243414E+01 | loss scale: 4096.0 | grad norm: 91992.234 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 92/ 292968 | consumed samples: 188416 | consumed tokens: 12058624 | elapsed time per iteration (ms): 90069.0 | learning rate: 5.226E-05 | global batch size: 2048 | lm loss: 1.250063E+01 | loss scale: 4096.0 | grad norm: 208949.735 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 93/ 292968 | consumed samples: 190464 | consumed tokens: 12189696 | elapsed time per iteration (ms): 88404.3 | learning rate: 5.283E-05 | global batch size: 2048 | lm loss: 1.076858E+01 | loss scale: 4096.0 | grad norm: 246369.456 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 94/ 292968 | consumed samples: 192512 | consumed tokens: 12320768 | elapsed time per iteration (ms): 87627.4 | learning rate: 5.340E-05 | global batch size: 2048 | lm loss: 1.040920E+01 | loss scale: 4096.0 | grad norm: 1916245.357 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 95/ 292968 | consumed samples: 194560 | consumed tokens: 12451840 | elapsed time per iteration (ms): 88178.3 | learning rate: 5.396E-05 | global batch size: 2048 | lm loss: 1.041481E+01 | loss scale: 4096.0 | grad norm: 1239060.720 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 96/ 292968 | consumed samples: 196608 | consumed tokens: 12582912 | elapsed time per iteration (ms): 94261.9 | learning rate: 5.453E-05 | global batch size: 2048 | lm loss: 1.059174E+01 | loss scale: 4096.0 | grad norm: 82840.232 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 97/ 292968 | consumed samples: 198656 | consumed tokens: 12713984 | elapsed time per iteration (ms): 88673.5 | learning rate: 5.510E-05 | global batch size: 2048 | lm loss: 1.026570E+01 | loss scale: 4096.0 | grad norm: 370187.286 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 98/ 292968 | consumed samples: 200704 | consumed tokens: 12845056 | elapsed time per iteration (ms): 86500.0 | learning rate: 5.567E-05 | global batch size: 2048 | lm loss: 1.006981E+01 | loss scale: 4096.0 | grad norm: 605376.906 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 99/ 292968 | consumed samples: 202752 | consumed tokens: 12976128 | elapsed time per iteration (ms): 86702.8 | learning rate: 5.624E-05 | global batch size: 2048 | lm loss: 9.988615E+00 | loss scale: 4096.0 | grad norm: 83140.616 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 100/ 292968 | consumed samples: 204800 | consumed tokens: 13107200 | elapsed time per iteration (ms): 87413.4 | learning rate: 5.680E-05 | global batch size: 2048 | lm loss: 9.906872E+00 | loss scale: 4096.0 | grad norm: 125443.880 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 101/ 292968 | consumed samples: 206848 | consumed tokens: 13238272 | elapsed time per iteration (ms): 86527.4 | learning rate: 5.737E-05 | global batch size: 2048 | lm loss: 9.554595E+00 | loss scale: 4096.0 | grad norm: 28898.456 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 102/ 292968 | consumed samples: 208896 | consumed tokens: 13369344 | elapsed time per iteration (ms): 84580.4 | learning rate: 5.794E-05 | global batch size: 2048 | lm loss: 9.300461E+00 | loss scale: 4096.0 | grad norm: 44323.452 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 103/ 292968 | consumed samples: 210944 | consumed tokens: 13500416 | elapsed time per iteration (ms): 83831.9 | learning rate: 5.851E-05 | global batch size: 2048 | lm loss: 8.932423E+00 | loss scale: 4096.0 | grad norm: 84600.855 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 104/ 292968 | consumed samples: 212992 | consumed tokens: 13631488 | elapsed time per iteration (ms): 84254.0 | learning rate: 5.908E-05 | global batch size: 2048 | lm loss: 8.679379E+00 | loss scale: 4096.0 | grad norm: 24483.757 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 105/ 292968 | consumed samples: 215040 | consumed tokens: 13762560 | elapsed time per iteration (ms): 84517.3 | learning rate: 5.964E-05 | global batch size: 2048 | lm loss: 8.396422E+00 | loss scale: 4096.0 | grad norm: 50694.781 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 106/ 292968 | consumed samples: 217088 | consumed tokens: 13893632 | elapsed time per iteration (ms): 83544.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.595929E+00 | loss scale: 4096.0 | grad norm: 163149.807 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 107/ 292968 | consumed samples: 219136 | consumed tokens: 14024704 | elapsed time per iteration (ms): 83372.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.214969E+00 | loss scale: 4096.0 | grad norm: 52162.030 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 108/ 292968 | consumed samples: 221184 | consumed tokens: 14155776 | elapsed time per iteration (ms): 84323.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.075233E+00 | loss scale: 4096.0 | grad norm: 29481.182 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 109/ 292968 | consumed samples: 223232 | consumed tokens: 14286848 | elapsed time per iteration (ms): 83802.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.992946E+00 | loss scale: 4096.0 | grad norm: 398062.298 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 110/ 292968 | consumed samples: 225280 | consumed tokens: 14417920 | elapsed time per iteration (ms): 83530.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.015919E+00 | loss scale: 4096.0 | grad norm: 363732.284 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 111/ 292968 | consumed samples: 227328 | consumed tokens: 14548992 | elapsed time per iteration (ms): 82713.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.129687E+00 | loss scale: 4096.0 | grad norm: 2461863.465 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 112/ 292968 | consumed samples: 229376 | consumed tokens: 14680064 | elapsed time per iteration (ms): 84205.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.436904E+00 | loss scale: 4096.0 | grad norm: 183275.470 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 113/ 292968 | consumed samples: 231424 | consumed tokens: 14811136 | elapsed time per iteration (ms): 83468.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.569748E+00 | loss scale: 4096.0 | grad norm: 103778.630 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 114/ 292968 | consumed samples: 233472 | consumed tokens: 14942208 | elapsed time per iteration (ms): 84300.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.405585E+00 | loss scale: 4096.0 | grad norm: 75436.130 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 115/ 292968 | consumed samples: 235520 | consumed tokens: 15073280 | elapsed time per iteration (ms): 82269.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.359320E+00 | loss scale: 4096.0 | grad norm: 27416.456 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 116/ 292968 | consumed samples: 237568 | consumed tokens: 15204352 | elapsed time per iteration (ms): 85449.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.230930E+00 | loss scale: 4096.0 | grad norm: 26721.610 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 117/ 292968 | consumed samples: 239616 | consumed tokens: 15335424 | elapsed time per iteration (ms): 84496.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 8.092511E+00 | loss scale: 4096.0 | grad norm: 17608.072 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 118/ 292968 | consumed samples: 241664 | consumed tokens: 15466496 | elapsed time per iteration (ms): 83433.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.976659E+00 | loss scale: 4096.0 | grad norm: 29611.557 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 119/ 292968 | consumed samples: 243712 | consumed tokens: 15597568 | elapsed time per iteration (ms): 84986.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.977108E+00 | loss scale: 4096.0 | grad norm: 72739.451 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 120/ 292968 | consumed samples: 245760 | consumed tokens: 15728640 | elapsed time per iteration (ms): 84035.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.856696E+00 | loss scale: 4096.0 | grad norm: 38426.664 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 121/ 292968 | consumed samples: 247808 | consumed tokens: 15859712 | elapsed time per iteration (ms): 83850.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.958491E+00 | loss scale: 4096.0 | grad norm: 59412.980 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 122/ 292968 | consumed samples: 249856 | consumed tokens: 15990784 | elapsed time per iteration (ms): 83823.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.979200E+00 | loss scale: 4096.0 | grad norm: 44005.321 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 123/ 292968 | consumed samples: 251904 | consumed tokens: 16121856 | elapsed time per iteration (ms): 84164.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.946018E+00 | loss scale: 4096.0 | grad norm: 19821.032 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 124/ 292968 | consumed samples: 253952 | consumed tokens: 16252928 | elapsed time per iteration (ms): 85380.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.890563E+00 | loss scale: 4096.0 | grad norm: 17517.923 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 125/ 292968 | consumed samples: 256000 | consumed tokens: 16384000 | elapsed time per iteration (ms): 84501.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.804246E+00 | loss scale: 4096.0 | grad norm: 15773.631 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 126/ 292968 | consumed samples: 258048 | consumed tokens: 16515072 | elapsed time per iteration (ms): 82645.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.783193E+00 | loss scale: 4096.0 | grad norm: 24473.277 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 127/ 292968 | consumed samples: 260096 | consumed tokens: 16646144 | elapsed time per iteration (ms): 84285.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.838306E+00 | loss scale: 4096.0 | grad norm: 54289.556 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 128/ 292968 | consumed samples: 262144 | consumed tokens: 16777216 | elapsed time per iteration (ms): 85512.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.875585E+00 | loss scale: 4096.0 | grad norm: 54316.504 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 129/ 292968 | consumed samples: 264192 | consumed tokens: 16908288 | elapsed time per iteration (ms): 82292.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.743047E+00 | loss scale: 4096.0 | grad norm: 15853.527 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 130/ 292968 | consumed samples: 266240 | consumed tokens: 17039360 | elapsed time per iteration (ms): 83756.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.750044E+00 | loss scale: 4096.0 | grad norm: 11782.811 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 131/ 292968 | consumed samples: 268288 | consumed tokens: 17170432 | elapsed time per iteration (ms): 81452.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.693849E+00 | loss scale: 4096.0 | grad norm: 15007.237 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 132/ 292968 | consumed samples: 270336 | consumed tokens: 17301504 | elapsed time per iteration (ms): 83767.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.687657E+00 | loss scale: 4096.0 | grad norm: 14027.855 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 133/ 292968 | consumed samples: 272384 | consumed tokens: 17432576 | elapsed time per iteration (ms): 83051.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.678932E+00 | loss scale: 4096.0 | grad norm: 17580.141 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 134/ 292968 | consumed samples: 274432 | consumed tokens: 17563648 | elapsed time per iteration (ms): 82149.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.698955E+00 | loss scale: 4096.0 | grad norm: 11785.157 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 135/ 292968 | consumed samples: 276480 | consumed tokens: 17694720 | elapsed time per iteration (ms): 82229.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.680353E+00 | loss scale: 4096.0 | grad norm: 16600.023 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 136/ 292968 | consumed samples: 278528 | consumed tokens: 17825792 | elapsed time per iteration (ms): 82686.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.647106E+00 | loss scale: 4096.0 | grad norm: 11050.320 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 137/ 292968 | consumed samples: 280576 | consumed tokens: 17956864 | elapsed time per iteration (ms): 82787.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.596872E+00 | loss scale: 4096.0 | grad norm: 12135.277 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 138/ 292968 | consumed samples: 282624 | consumed tokens: 18087936 | elapsed time per iteration (ms): 83092.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.628756E+00 | loss scale: 4096.0 | grad norm: 17508.768 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 139/ 292968 | consumed samples: 284672 | consumed tokens: 18219008 | elapsed time per iteration (ms): 83077.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.586628E+00 | loss scale: 4096.0 | grad norm: 14450.604 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 140/ 292968 | consumed samples: 286720 | consumed tokens: 18350080 | elapsed time per iteration (ms): 83887.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.597292E+00 | loss scale: 4096.0 | grad norm: 11600.177 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 141/ 292968 | consumed samples: 288768 | consumed tokens: 18481152 | elapsed time per iteration (ms): 83014.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.580558E+00 | loss scale: 4096.0 | grad norm: 9108.881 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 142/ 292968 | consumed samples: 290816 | consumed tokens: 18612224 | elapsed time per iteration (ms): 82477.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.545835E+00 | loss scale: 4096.0 | grad norm: 18359.147 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 143/ 292968 | consumed samples: 292864 | consumed tokens: 18743296 | elapsed time per iteration (ms): 83251.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.556341E+00 | loss scale: 4096.0 | grad norm: 19346.897 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 144/ 292968 | consumed samples: 294912 | consumed tokens: 18874368 | elapsed time per iteration (ms): 83785.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.559230E+00 | loss scale: 4096.0 | grad norm: 15038.131 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 145/ 292968 | consumed samples: 296960 | consumed tokens: 19005440 | elapsed time per iteration (ms): 82829.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.533126E+00 | loss scale: 4096.0 | grad norm: 11829.824 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 146/ 292968 | consumed samples: 299008 | consumed tokens: 19152896 | elapsed time per iteration (ms): 89370.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.556136E+00 | loss scale: 4096.0 | grad norm: 20986.741 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 147/ 292968 | consumed samples: 301056 | consumed tokens: 19300352 | elapsed time per iteration (ms): 90535.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.544194E+00 | loss scale: 4096.0 | grad norm: 18238.409 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 148/ 292968 | consumed samples: 303104 | consumed tokens: 19447808 | elapsed time per iteration (ms): 90984.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.565289E+00 | loss scale: 4096.0 | grad norm: 28307.457 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 149/ 292968 | consumed samples: 305152 | consumed tokens: 19595264 | elapsed time per iteration (ms): 94756.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.524890E+00 | loss scale: 4096.0 | grad norm: 12548.541 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 150/ 292968 | consumed samples: 307200 | consumed tokens: 19742720 | elapsed time per iteration (ms): 92377.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.483114E+00 | loss scale: 4096.0 | grad norm: 13697.514 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 151/ 292968 | consumed samples: 309248 | consumed tokens: 19890176 | elapsed time per iteration (ms): 91251.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.536344E+00 | loss scale: 4096.0 | grad norm: 20026.589 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 152/ 292968 | consumed samples: 311296 | consumed tokens: 20037632 | elapsed time per iteration (ms): 92849.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.532176E+00 | loss scale: 4096.0 | grad norm: 16023.451 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 153/ 292968 | consumed samples: 313344 | consumed tokens: 20185088 | elapsed time per iteration (ms): 94067.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.487831E+00 | loss scale: 4096.0 | grad norm: 21450.698 | num
zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 154/ 292968 | consumed samples: 315392 | consumed tokens: 20332544 | elapsed time per iteration (ms): 91359.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.459096E+00 | loss scale: 4096.0 | grad norm: 15661.443 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 155/ 292968 | consumed samples: 317440 | consumed tokens: 20480000 | elapsed time per iteration (ms): 92031.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.455278E+00 | loss scale: 4096.0 | grad norm: 16488.949 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 156/ 292968 | consumed samples: 319488 | consumed tokens: 20627456 | elapsed time per iteration (ms): 92078.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.449800E+00 | loss scale: 4096.0 | grad norm: 16294.586 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 157/ 292968 | consumed samples: 321536 | consumed tokens: 20774912 | elapsed time per iteration (ms): 92324.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.502323E+00 | loss scale: 4096.0 | grad norm: 26629.379 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 158/ 292968 | consumed samples: 323584 | consumed tokens: 20922368 | elapsed time per iteration (ms): 90851.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.476156E+00 | loss scale: 4096.0 | grad norm: 15409.139 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 159/ 292968 | consumed samples: 325632 | consumed tokens: 21069824 | elapsed time per iteration (ms): 92543.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.454834E+00 | loss scale: 4096.0 | grad norm: 16566.363 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 160/ 292968 | consumed samples: 327680 | consumed tokens: 21217280 | elapsed time per iteration (ms): 91407.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.453666E+00 | loss scale: 4096.0 | grad norm: 19858.130 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 161/ 292968 | consumed samples: 329728 | consumed tokens: 21364736 | elapsed time per iteration (ms): 90952.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.438191E+00 | loss scale: 4096.0 | grad norm: 26371.022 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 162/ 292968 | consumed samples: 331776 | consumed tokens: 21512192 | elapsed time per iteration (ms): 91256.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.423912E+00 | loss scale: 4096.0 | grad norm: 15875.077 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 163/ 292968 | consumed samples: 333824 | consumed tokens: 21659648 | elapsed time per iteration (ms): 89347.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 
7.410336E+00 | loss scale: 4096.0 | grad norm: 13237.168 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 164/ 292968 | consumed samples: 335872 | consumed tokens: 21807104 | elapsed time per iteration (ms): 89477.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.422408E+00 | loss scale: 4096.0 | grad norm: 23570.944 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 165/ 292968 | consumed samples: 337920 | consumed tokens: 21954560 | elapsed time per iteration (ms): 92094.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.402050E+00 | loss scale: 4096.0 | grad norm: 17511.089 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 166/ 292968 | consumed samples: 339968 | consumed tokens: 22102016 | elapsed time per iteration (ms): 91807.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.440965E+00 | loss scale: 4096.0 | grad norm: 23039.323 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 167/ 292968 | consumed samples: 342016 | consumed tokens: 22249472 | elapsed time per iteration (ms): 91892.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.429036E+00 | loss scale: 4096.0 | grad norm: 19677.411 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 168/ 292968 | consumed samples: 344064 | consumed tokens: 22396928 | elapsed time per iteration (ms): 90332.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.408422E+00 | loss scale: 4096.0 | grad norm: 19333.799 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 169/ 292968 | consumed samples: 346112 | consumed tokens: 22544384 | elapsed time per iteration (ms): 92031.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.451711E+00 | loss scale: 4096.0 | grad norm: 34113.520 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 170/ 292968 | consumed samples: 348160 | consumed tokens: 22691840 | elapsed time per iteration (ms): 90975.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.439358E+00 | loss scale: 4096.0 | grad norm: 27264.410 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 171/ 292968 | consumed samples: 350208 | consumed tokens: 22839296 | elapsed time per iteration (ms): 91121.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.416613E+00 | loss scale: 4096.0 | grad norm: 29632.702 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 172/ 292968 | consumed samples: 352256 | consumed tokens: 22986752 | elapsed time per iteration (ms): 91798.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.393854E+00 | loss scale: 4096.0 | grad norm: 17631.853 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 173/ 292968 | consumed samples: 354304 | consumed tokens: 23134208 | elapsed time per iteration (ms): 90335.2 | 
learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.378123E+00 | loss scale: 4096.0 | grad norm: 30734.252 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 174/ 292968 | consumed samples: 356352 | consumed tokens: 23281664 | elapsed time per iteration (ms): 92211.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.418646E+00 | loss scale: 4096.0 | grad norm: 42772.780 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 175/ 292968 | consumed samples: 358400 | consumed tokens: 23429120 | elapsed time per iteration (ms): 92730.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.415193E+00 | loss scale: 4096.0 | grad norm: 26586.965 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 176/ 292968 | consumed samples: 360448 | consumed tokens: 23576576 | elapsed time per iteration (ms): 90532.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.432006E+00 | loss scale: 4096.0 | grad norm: 25924.772 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 177/ 292968 | consumed samples: 362496 | consumed tokens: 23724032 | elapsed time per iteration (ms): 94941.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.468750E+00 | loss scale: 4096.0 | grad norm: 51066.459 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 178/ 292968 | consumed samples: 364544 | consumed tokens: 23871488 | elapsed time per iteration (ms): 93385.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.411962E+00 | loss scale: 4096.0 | grad norm: 20014.054 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 179/ 292968 | consumed samples: 366592 | consumed tokens: 24018944 | elapsed time per iteration (ms): 91799.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.407637E+00 | loss scale: 4096.0 | grad norm: 34583.106 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 180/ 292968 | consumed samples: 368640 | consumed tokens: 24166400 | elapsed time per iteration (ms): 90172.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.429335E+00 | loss scale: 4096.0 | grad norm: 44193.318 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 181/ 292968 | consumed samples: 370688 | consumed tokens: 24313856 | elapsed time per iteration (ms): 91357.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.375701E+00 | loss scale: 4096.0 | grad norm: 30485.559 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 182/ 292968 | consumed samples: 372736 | consumed tokens: 24461312 | elapsed time per iteration (ms): 91085.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.345478E+00 | loss scale: 4096.0 | grad norm: 25002.339 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 183/ 292968 | consumed samples: 374784 | consumed 
tokens: 24608768 | elapsed time per iteration (ms): 93733.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.401399E+00 | loss scale: 4096.0 | grad norm: 25541.577 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 184/ 292968 | consumed samples: 376832 | consumed tokens: 24756224 | elapsed time per iteration (ms): 91528.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.367949E+00 | loss scale: 4096.0 | grad norm: 16743.668 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 185/ 292968 | consumed samples: 378880 | consumed tokens: 24903680 | elapsed time per iteration (ms): 91011.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.366905E+00 | loss scale: 4096.0 | grad norm: 36863.372 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 186/ 292968 | consumed samples: 380928 | consumed tokens: 25051136 | elapsed time per iteration (ms): 90930.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.351393E+00 | loss scale: 4096.0 | grad norm: 25047.798 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 187/ 292968 | consumed samples: 382976 | consumed tokens: 25198592 | elapsed time per iteration (ms): 91557.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.361308E+00 | loss scale: 4096.0 | grad norm: 34015.146 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 188/ 292968 | consumed samples: 385024 | consumed tokens: 25346048 | elapsed time per iteration (ms): 90508.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.377004E+00 | loss scale: 4096.0 | grad norm: 30585.653 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 189/ 292968 | consumed samples: 387072 | consumed tokens: 25493504 | elapsed time per iteration (ms): 90778.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.342936E+00 | loss scale: 4096.0 | grad norm: 16302.708 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 190/ 292968 | consumed samples: 389120 | consumed tokens: 25640960 | elapsed time per iteration (ms): 90067.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.326052E+00 | loss scale: 4096.0 | grad norm: 22075.578 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 191/ 292968 | consumed samples: 391168 | consumed tokens: 25788416 | elapsed time per iteration (ms): 90798.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.417330E+00 | loss scale: 4096.0 | grad norm: 37592.605 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 192/ 292968 | consumed samples: 393216 | consumed tokens: 25935872 | elapsed time per iteration (ms): 91108.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.411121E+00 | loss scale: 4096.0 | grad norm: 31105.301 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - 
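The curriculum seqlen column stepped from 64 to 72 at iteration 146 and steps again from 72 to 80 at iteration 291 later in this log, i.e. +8 tokens of context every 145 iterations. That spacing is consistent with a linear growth curriculum, sketched below; the schedule is inferred from these two observed step points alone, since the training configuration is not part of this excerpt:

    def curriculum_seqlen(iteration, start=64, step=8, period=145):
        # Inferred schedule: +`step` context tokens every `period` iterations.
        return start + step * ((iteration - 1) // period)

    assert curriculum_seqlen(145) == 64
    assert curriculum_seqlen(146) == 72   # step observed at iteration 146
    assert curriculum_seqlen(291) == 80   # step observed at iteration 291

This also accounts for the per-iteration jump in consumed tokens: 2048 * 64 = 131072 through iteration 145, 2048 * 72 = 147456 from iteration 146, and 2048 * 80 = 163840 from iteration 291 onward.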
iteration 193/ 292968 | consumed samples: 395264 | consumed tokens: 26083328 | elapsed time per iteration (ms): 90598.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.322796E+00 | loss scale: 4096.0 | grad norm: 18106.360 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 194/ 292968 | consumed samples: 397312 | consumed tokens: 26230784 | elapsed time per iteration (ms): 91194.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.334939E+00 | loss scale: 4096.0 | grad norm: 20965.888 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 195/ 292968 | consumed samples: 399360 | consumed tokens: 26378240 | elapsed time per iteration (ms): 92871.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.330479E+00 | loss scale: 4096.0 | grad norm: 23612.456 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 196/ 292968 | consumed samples: 401408 | consumed tokens: 26525696 | elapsed time per iteration (ms): 90169.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.291287E+00 | loss scale: 4096.0 | grad norm: 12967.334 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 197/ 292968 | consumed samples: 403456 | consumed tokens: 26673152 | elapsed time per iteration (ms): 89881.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.331447E+00 | loss scale: 4096.0 | grad norm: 22611.171 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 198/ 292968 | consumed samples: 405504 | consumed tokens: 26820608 | elapsed time per iteration (ms): 89503.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.323497E+00 | loss scale: 4096.0 | grad norm: 22002.151 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 199/ 292968 | consumed samples: 407552 | consumed tokens: 26968064 | elapsed time per iteration (ms): 87665.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.291583E+00 | loss scale: 4096.0 | grad norm: 16687.669 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 200/ 292968 | consumed samples: 409600 | consumed tokens: 27115520 | elapsed time per iteration (ms): 89936.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.281228E+00 | loss scale: 4096.0 | grad norm: 18218.160 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 201/ 292968 | consumed samples: 411648 | consumed tokens: 27262976 | elapsed time per iteration (ms): 90137.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.263555E+00 | loss scale: 4096.0 | grad norm: 17077.486 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 202/ 292968 | consumed samples: 413696 | consumed tokens: 27410432 | elapsed time per iteration (ms): 90442.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.259840E+00 | loss scale: 4096.0 | grad norm: 9457.174 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 203/ 292968 | consumed samples: 415744 | consumed tokens: 27557888 | elapsed time per iteration (ms): 89976.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.304833E+00 | loss scale: 4096.0 | grad norm: 29052.434 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 204/ 292968 | consumed samples: 417792 | consumed tokens: 27705344 | elapsed time per iteration (ms): 90534.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.275797E+00 | loss scale: 4096.0 | grad norm: 23079.934 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 205/ 292968 | consumed samples: 419840 | consumed tokens: 27852800 | elapsed time per iteration (ms): 91185.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.302791E+00 | loss scale: 4096.0 | grad norm: 12181.696 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 206/ 292968 | consumed samples: 421888 | consumed tokens: 28000256 | elapsed time per iteration (ms): 89487.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.280893E+00 | loss scale: 4096.0 | grad norm: 11078.822 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 207/ 292968 | consumed samples: 423936 | consumed tokens: 28147712 | elapsed time per iteration (ms): 90140.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.271828E+00 | loss scale: 4096.0 | grad norm: 17291.990 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 208/ 292968 | consumed samples: 425984 | consumed tokens: 28295168 | elapsed time per iteration (ms): 89816.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.281390E+00 | loss scale: 4096.0 | grad norm: 11414.129 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 209/ 292968 | consumed samples: 428032 | consumed tokens: 28442624 | elapsed time per iteration (ms): 89300.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.262185E+00 | loss scale: 4096.0 | grad norm: 16668.443 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 210/ 292968 | consumed samples: 430080 | consumed tokens: 28590080 | elapsed time per iteration (ms): 89758.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.264831E+00 | loss scale: 4096.0 | grad norm: 11439.927 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 211/ 292968 | consumed samples: 432128 | consumed tokens: 28737536 | elapsed time per iteration (ms): 91769.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.241442E+00 | loss scale: 4096.0 | grad norm: 13925.741 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 212/ 292968 | consumed samples: 434176 | consumed tokens: 28884992 | elapsed time per iteration (ms): 88889.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.260759E+00 | loss scale: 4096.0 | grad norm: 12398.712 | num 
zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 213/ 292968 | consumed samples: 436224 | consumed tokens: 29032448 | elapsed time per iteration (ms): 88393.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.227284E+00 | loss scale: 4096.0 | grad norm: 10625.202 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 214/ 292968 | consumed samples: 438272 | consumed tokens: 29179904 | elapsed time per iteration (ms): 89775.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.219506E+00 | loss scale: 4096.0 | grad norm: 9982.170 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 215/ 292968 | consumed samples: 440320 | consumed tokens: 29327360 | elapsed time per iteration (ms): 88176.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.223164E+00 | loss scale: 4096.0 | grad norm: 13426.677 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 216/ 292968 | consumed samples: 442368 | consumed tokens: 29474816 | elapsed time per iteration (ms): 87246.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.232490E+00 | loss scale: 4096.0 | grad norm: 10402.551 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 217/ 292968 | consumed samples: 444416 | consumed tokens: 29622272 | elapsed time per iteration (ms): 88799.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.226647E+00 | loss scale: 4096.0 | grad norm: 10730.363 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 218/ 292968 | consumed samples: 446464 | consumed tokens: 29769728 | elapsed time per iteration (ms): 88471.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.226092E+00 | loss scale: 4096.0 | grad norm: 8760.466 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 219/ 292968 | consumed samples: 448512 | consumed tokens: 29917184 | elapsed time per iteration (ms): 87296.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.226552E+00 | loss scale: 4096.0 | grad norm: 9459.818 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 220/ 292968 | consumed samples: 450560 | consumed tokens: 30064640 | elapsed time per iteration (ms): 88141.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.206302E+00 | loss scale: 4096.0 | grad norm: 7831.394 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 221/ 292968 | consumed samples: 452608 | consumed tokens: 30212096 | elapsed time per iteration (ms): 88684.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.232483E+00 | loss scale: 4096.0 | grad norm: 12931.787 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 222/ 292968 | consumed samples: 454656 | consumed tokens: 30359552 | elapsed time per iteration (ms): 89058.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 
7.196499E+00 | loss scale: 4096.0 | grad norm: 6361.027 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 223/ 292968 | consumed samples: 456704 | consumed tokens: 30507008 | elapsed time per iteration (ms): 89746.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.179220E+00 | loss scale: 4096.0 | grad norm: 10442.281 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 224/ 292968 | consumed samples: 458752 | consumed tokens: 30654464 | elapsed time per iteration (ms): 87199.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.207948E+00 | loss scale: 4096.0 | grad norm: 9531.703 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 225/ 292968 | consumed samples: 460800 | consumed tokens: 30801920 | elapsed time per iteration (ms): 87556.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.188715E+00 | loss scale: 4096.0 | grad norm: 7862.797 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 226/ 292968 | consumed samples: 462848 | consumed tokens: 30949376 | elapsed time per iteration (ms): 88501.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.222584E+00 | loss scale: 4096.0 | grad norm: 6611.457 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 227/ 292968 | consumed samples: 464896 | consumed tokens: 31096832 | elapsed time per iteration (ms): 88975.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.199027E+00 | loss scale: 4096.0 | grad norm: 7996.471 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 228/ 292968 | consumed samples: 466944 | consumed tokens: 31244288 | elapsed time per iteration (ms): 89060.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.196391E+00 | loss scale: 4096.0 | grad norm: 7503.172 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 229/ 292968 | consumed samples: 468992 | consumed tokens: 31391744 | elapsed time per iteration (ms): 87689.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.189396E+00 | loss scale: 4096.0 | grad norm: 7376.848 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 230/ 292968 | consumed samples: 471040 | consumed tokens: 31539200 | elapsed time per iteration (ms): 88025.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.244632E+00 | loss scale: 4096.0 | grad norm: 5261.136 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 231/ 292968 | consumed samples: 473088 | consumed tokens: 31686656 | elapsed time per iteration (ms): 86033.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.174881E+00 | loss scale: 4096.0 | grad norm: 8701.154 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 232/ 292968 | consumed samples: 475136 | consumed tokens: 31834112 | elapsed time per iteration (ms): 87319.3 | learning rate: 
6.000E-05 | global batch size: 2048 | lm loss: 7.199638E+00 | loss scale: 4096.0 | grad norm: 6819.042 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 233/ 292968 | consumed samples: 477184 | consumed tokens: 31981568 | elapsed time per iteration (ms): 88004.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.181946E+00 | loss scale: 4096.0 | grad norm: 6878.653 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 234/ 292968 | consumed samples: 479232 | consumed tokens: 32129024 | elapsed time per iteration (ms): 85397.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.183530E+00 | loss scale: 4096.0 | grad norm: 6439.746 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 235/ 292968 | consumed samples: 481280 | consumed tokens: 32276480 | elapsed time per iteration (ms): 87334.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.190883E+00 | loss scale: 4096.0 | grad norm: 6277.546 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 236/ 292968 | consumed samples: 483328 | consumed tokens: 32423936 | elapsed time per iteration (ms): 88658.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.195468E+00 | loss scale: 4096.0 | grad norm: 5578.579 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 237/ 292968 | consumed samples: 485376 | consumed tokens: 32571392 | elapsed time per iteration (ms): 87058.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.155280E+00 | loss scale: 4096.0 | grad norm: 4153.910 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 238/ 292968 | consumed samples: 487424 | consumed tokens: 32718848 | elapsed time per iteration (ms): 87528.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.151063E+00 | loss scale: 4096.0 | grad norm: 4058.616 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 239/ 292968 | consumed samples: 489472 | consumed tokens: 32866304 | elapsed time per iteration (ms): 86087.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.184216E+00 | loss scale: 4096.0 | grad norm: 4905.619 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 240/ 292968 | consumed samples: 491520 | consumed tokens: 33013760 | elapsed time per iteration (ms): 86648.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.154154E+00 | loss scale: 4096.0 | grad norm: 3555.795 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 241/ 292968 | consumed samples: 493568 | consumed tokens: 33161216 | elapsed time per iteration (ms): 87397.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.136785E+00 | loss scale: 4096.0 | grad norm: 5871.927 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 242/ 292968 | consumed samples: 495616 | consumed tokens: 33308672 | elapsed time 
per iteration (ms): 86344.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.172124E+00 | loss scale: 4096.0 | grad norm: 3207.367 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 243/ 292968 | consumed samples: 497664 | consumed tokens: 33456128 | elapsed time per iteration (ms): 86324.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.173362E+00 | loss scale: 4096.0 | grad norm: 4931.001 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 244/ 292968 | consumed samples: 499712 | consumed tokens: 33603584 | elapsed time per iteration (ms): 88210.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.149157E+00 | loss scale: 4096.0 | grad norm: 4066.526 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 245/ 292968 | consumed samples: 501760 | consumed tokens: 33751040 | elapsed time per iteration (ms): 87057.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.158215E+00 | loss scale: 4096.0 | grad norm: 4408.121 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 246/ 292968 | consumed samples: 503808 | consumed tokens: 33898496 | elapsed time per iteration (ms): 86667.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.145041E+00 | loss scale: 4096.0 | grad norm: 4402.722 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 247/ 292968 | consumed samples: 505856 | consumed tokens: 34045952 | elapsed time per iteration (ms): 87844.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.155570E+00 | loss scale: 4096.0 | grad norm: 4267.789 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 248/ 292968 | consumed samples: 507904 | consumed tokens: 34193408 | elapsed time per iteration (ms): 85879.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.156795E+00 | loss scale: 4096.0 | grad norm: 3457.798 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 249/ 292968 | consumed samples: 509952 | consumed tokens: 34340864 | elapsed time per iteration (ms): 85960.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.165915E+00 | loss scale: 4096.0 | grad norm: 3595.937 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 250/ 292968 | consumed samples: 512000 | consumed tokens: 34488320 | elapsed time per iteration (ms): 85594.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.150114E+00 | loss scale: 4096.0 | grad norm: 3544.362 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 251/ 292968 | consumed samples: 514048 | consumed tokens: 34635776 | elapsed time per iteration (ms): 84590.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.132024E+00 | loss scale: 4096.0 | grad norm: 3924.917 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 252/ 292968 | consumed samples: 
516096 | consumed tokens: 34783232 | elapsed time per iteration (ms): 83414.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.108165E+00 | loss scale: 4096.0 | grad norm: 2755.817 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 253/ 292968 | consumed samples: 518144 | consumed tokens: 34930688 | elapsed time per iteration (ms): 83645.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.122099E+00 | loss scale: 4096.0 | grad norm: 3453.597 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 254/ 292968 | consumed samples: 520192 | consumed tokens: 35078144 | elapsed time per iteration (ms): 86420.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.117218E+00 | loss scale: 4096.0 | grad norm: 2813.488 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 255/ 292968 | consumed samples: 522240 | consumed tokens: 35225600 | elapsed time per iteration (ms): 85643.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.111281E+00 | loss scale: 4096.0 | grad norm: 3916.570 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 256/ 292968 | consumed samples: 524288 | consumed tokens: 35373056 | elapsed time per iteration (ms): 83003.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.130394E+00 | loss scale: 4096.0 | grad norm: 2624.113 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 257/ 292968 | consumed samples: 526336 | consumed tokens: 35520512 | elapsed time per iteration (ms): 85338.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.111339E+00 | loss scale: 4096.0 | grad norm: 3157.161 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 258/ 292968 | consumed samples: 528384 | consumed tokens: 35667968 | elapsed time per iteration (ms): 84011.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.095218E+00 | loss scale: 4096.0 | grad norm: 2666.346 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 259/ 292968 | consumed samples: 530432 | consumed tokens: 35815424 | elapsed time per iteration (ms): 86144.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.115823E+00 | loss scale: 4096.0 | grad norm: 3143.871 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 260/ 292968 | consumed samples: 532480 | consumed tokens: 35962880 | elapsed time per iteration (ms): 84516.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.104167E+00 | loss scale: 4096.0 | grad norm: 2367.017 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 261/ 292968 | consumed samples: 534528 | consumed tokens: 36110336 | elapsed time per iteration (ms): 85507.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.085538E+00 | loss scale: 4096.0 | grad norm: 3140.141 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time 
(ms) - iteration 262/ 292968 | consumed samples: 536576 | consumed tokens: 36257792 | elapsed time per iteration (ms): 83825.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.100977E+00 | loss scale: 4096.0 | grad norm: 2888.430 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 263/ 292968 | consumed samples: 538624 | consumed tokens: 36405248 | elapsed time per iteration (ms): 85664.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.116061E+00 | loss scale: 4096.0 | grad norm: 3145.440 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 264/ 292968 | consumed samples: 540672 | consumed tokens: 36552704 | elapsed time per iteration (ms): 85735.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.104592E+00 | loss scale: 4096.0 | grad norm: 3066.935 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 265/ 292968 | consumed samples: 542720 | consumed tokens: 36700160 | elapsed time per iteration (ms): 85344.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.109227E+00 | loss scale: 4096.0 | grad norm: 2960.641 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 266/ 292968 | consumed samples: 544768 | consumed tokens: 36847616 | elapsed time per iteration (ms): 85025.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.105661E+00 | loss scale: 4096.0 | grad norm: 3041.998 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 267/ 292968 | consumed samples: 546816 | consumed tokens: 36995072 | elapsed time per iteration (ms): 85897.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.112930E+00 | loss scale: 4096.0 | grad norm: 3617.269 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 268/ 292968 | consumed samples: 548864 | consumed tokens: 37142528 | elapsed time per iteration (ms): 85348.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.107900E+00 | loss scale: 4096.0 | grad norm: 3257.462 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 269/ 292968 | consumed samples: 550912 | consumed tokens: 37289984 | elapsed time per iteration (ms): 85653.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.091923E+00 | loss scale: 4096.0 | grad norm: 3868.766 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 270/ 292968 | consumed samples: 552960 | consumed tokens: 37437440 | elapsed time per iteration (ms): 86303.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.091510E+00 | loss scale: 4096.0 | grad norm: 2734.039 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 271/ 292968 | consumed samples: 555008 | consumed tokens: 37584896 | elapsed time per iteration (ms): 86370.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.080944E+00 | loss scale: 4096.0 | grad norm: 2489.056 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 272/ 292968 | consumed samples: 557056 | consumed tokens: 37732352 | elapsed time per iteration (ms): 84589.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.104280E+00 | loss scale: 4096.0 | grad norm: 2907.656 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 273/ 292968 | consumed samples: 559104 | consumed tokens: 37879808 | elapsed time per iteration (ms): 84703.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.103776E+00 | loss scale: 4096.0 | grad norm: 1997.705 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 274/ 292968 | consumed samples: 561152 | consumed tokens: 38027264 | elapsed time per iteration (ms): 84472.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.100480E+00 | loss scale: 4096.0 | grad norm: 2917.056 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 275/ 292968 | consumed samples: 563200 | consumed tokens: 38174720 | elapsed time per iteration (ms): 85625.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.112624E+00 | loss scale: 4096.0 | grad norm: 2375.447 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 276/ 292968 | consumed samples: 565248 | consumed tokens: 38322176 | elapsed time per iteration (ms): 85095.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.109922E+00 | loss scale: 4096.0 | grad norm: 2321.919 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 277/ 292968 | consumed samples: 567296 | consumed tokens: 38469632 | elapsed time per iteration (ms): 87476.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.126214E+00 | loss scale: 4096.0 | grad norm: 2500.190 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 278/ 292968 | consumed samples: 569344 | consumed tokens: 38617088 | elapsed time per iteration (ms): 85542.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.100136E+00 | loss scale: 4096.0 | grad norm: 2554.178 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 279/ 292968 | consumed samples: 571392 | consumed tokens: 38764544 | elapsed time per iteration (ms): 86956.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.124643E+00 | loss scale: 4096.0 | grad norm: 2493.901 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 280/ 292968 | consumed samples: 573440 | consumed tokens: 38912000 | elapsed time per iteration (ms): 86596.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.121703E+00 | loss scale: 4096.0 | grad norm: 2227.610 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 281/ 292968 | consumed samples: 575488 | consumed tokens: 39059456 | elapsed time per iteration (ms): 85793.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.120274E+00 | loss scale: 4096.0 | grad norm: 3070.277 | num zeros: 
0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 282/ 292968 | consumed samples: 577536 | consumed tokens: 39206912 | elapsed time per iteration (ms): 85433.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.130398E+00 | loss scale: 4096.0 | grad norm: 2406.911 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 283/ 292968 | consumed samples: 579584 | consumed tokens: 39354368 | elapsed time per iteration (ms): 82910.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.115321E+00 | loss scale: 4096.0 | grad norm: 2714.693 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 284/ 292968 | consumed samples: 581632 | consumed tokens: 39501824 | elapsed time per iteration (ms): 84154.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.143191E+00 | loss scale: 4096.0 | grad norm: 2463.085 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 285/ 292968 | consumed samples: 583680 | consumed tokens: 39649280 | elapsed time per iteration (ms): 83878.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.128511E+00 | loss scale: 4096.0 | grad norm: 3032.257 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 286/ 292968 | consumed samples: 585728 | consumed tokens: 39796736 | elapsed time per iteration (ms): 84510.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.140932E+00 | loss scale: 4096.0 | grad norm: 2642.742 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 287/ 292968 | consumed samples: 587776 | consumed tokens: 39944192 | elapsed time per iteration (ms): 87083.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.125048E+00 | loss scale: 4096.0 | grad norm: 2178.950 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 288/ 292968 | consumed samples: 589824 | consumed tokens: 40091648 | elapsed time per iteration (ms): 86070.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.136089E+00 | loss scale: 4096.0 | grad norm: 2367.513 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 289/ 292968 | consumed samples: 591872 | consumed tokens: 40239104 | elapsed time per iteration (ms): 84871.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.169606E+00 | loss scale: 4096.0 | grad norm: 2471.813 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 290/ 292968 | consumed samples: 593920 | consumed tokens: 40386560 | elapsed time per iteration (ms): 84339.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.182253E+00 | loss scale: 4096.0 | grad norm: 2808.180 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 291/ 292968 | consumed samples: 595968 | consumed tokens: 40550400 | elapsed time per iteration (ms): 85880.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.175693E+00 | loss 
scale: 4096.0 | grad norm: 3829.514 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 292/ 292968 | consumed samples: 598016 | consumed tokens: 40714240 | elapsed time per iteration (ms): 84544.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.198044E+00 | loss scale: 4096.0 | grad norm: 3918.938 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 293/ 292968 | consumed samples: 600064 | consumed tokens: 40878080 | elapsed time per iteration (ms): 86511.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.216866E+00 | loss scale: 4096.0 | grad norm: 3100.168 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 294/ 292968 | consumed samples: 602112 | consumed tokens: 41041920 | elapsed time per iteration (ms): 83406.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.230723E+00 | loss scale: 4096.0 | grad norm: 2998.671 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 295/ 292968 | consumed samples: 604160 | consumed tokens: 41205760 | elapsed time per iteration (ms): 86280.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.210630E+00 | loss scale: 4096.0 | grad norm: 3041.224 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 296/ 292968 | consumed samples: 606208 | consumed tokens: 41369600 | elapsed time per iteration (ms): 83008.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.231707E+00 | loss scale: 4096.0 | grad norm: 3376.086 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 297/ 292968 | consumed samples: 608256 | consumed tokens: 41533440 | elapsed time per iteration (ms): 83544.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.230834E+00 | loss scale: 4096.0 | grad norm: 3210.857 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 298/ 292968 | consumed samples: 610304 | consumed tokens: 41697280 | elapsed time per iteration (ms): 85439.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.261968E+00 | loss scale: 4096.0 | grad norm: 2904.531 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 299/ 292968 | consumed samples: 612352 | consumed tokens: 41861120 | elapsed time per iteration (ms): 86039.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.248648E+00 | loss scale: 4096.0 | grad norm: 2478.195 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 300/ 292968 | consumed samples: 614400 | consumed tokens: 42024960 | elapsed time per iteration (ms): 85286.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.300456E+00 | loss scale: 4096.0 | grad norm: 2727.902 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) -saving checkpoint at iteration 300 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-23 03:04:06,246] [INFO] [logging.py:68:log_dist] 
-saving checkpoint at iteration 300 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-23 03:04:06,246] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/mp_rank_00_model_states.pt
-[2021-10-23 03:04:06,434] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/mp_rank_01_model_states.pt
-[2021-10-23 03:04:20,980] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_116_optim_states.pt
-[2021-10-23 03:04:41,955] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_125_optim_states.pt
-[2021-10-23 03:04:42,210] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_124_optim_states.pt
- successfully saved checkpoint at iteration 300 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 38930.30
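The save above stalls training for about 39 seconds (save-checkpoint: 38930.30 ms), and the same cost is visible in the step timer: iteration 301 below reports ~120 s against the ~85 s steady state, i.e. the checkpoint appears to be written synchronously inside the training step. A back-of-the-envelope estimate of the amortized cost follows; the 300-iteration save interval is an assumption inferred from "saving checkpoint at iteration 300" and is not shown explicitly in this excerpt.

# Rough amortized checkpoint overhead from the numbers in this log.
# The 300-step save interval is an assumption; save_ms and step_ms are
# read off the surrounding lines.
save_ms = 38930.30   # reported "time (ms) | save-checkpoint"
step_ms = 85000.0    # typical elapsed time per iteration in this log
interval = 300       # assumed checkpoint interval, in iterations

overhead = save_ms / (interval * step_ms)
print(f"amortized checkpoint overhead: {overhead:.3%}")   # ~0.153%

# Sanity check: iteration 301's 120172.3 ms minus the ~38930 ms save
# leaves ~81.2 s, within the normal 82-88 s step-time range seen here.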
- iteration 301/ 292968 | consumed samples: 616448 | consumed tokens: 42188800 | elapsed time per iteration (ms): 120172.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.294807E+00 | loss scale: 4096.0 | grad norm: 3246.195 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 302/ 292968 | consumed samples: 618496 | consumed tokens: 42352640 | elapsed time per iteration (ms): 83479.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.289993E+00 | loss scale: 4096.0 | grad norm: 3150.027 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 303/ 292968 | consumed samples: 620544 | consumed tokens: 42516480 | elapsed time per iteration (ms): 83523.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.309915E+00 | loss scale: 4096.0 | grad norm: 3078.914 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 304/ 292968 | consumed samples: 622592 | consumed tokens: 42680320 | elapsed time per iteration (ms): 84493.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.317823E+00 | loss scale: 4096.0 | grad norm: 2727.843 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 305/ 292968 | consumed samples: 624640 | consumed tokens: 42844160 | elapsed time per iteration (ms): 85920.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.297845E+00 | loss scale: 4096.0 | grad norm: 3290.396 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 306/ 292968 | consumed samples: 626688 | consumed tokens: 43008000 | elapsed time per iteration (ms): 86838.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.325935E+00 | loss scale: 4096.0 | grad norm: 4053.540 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 307/ 292968 | consumed samples: 628736 | consumed tokens: 43171840 | elapsed time per iteration (ms): 87309.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.318676E+00 | loss scale: 4096.0 | grad norm: 4156.098 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 308/ 292968 | consumed samples: 630784 | consumed tokens: 43335680 | elapsed time per iteration (ms): 86578.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.327109E+00 | loss scale: 4096.0 | grad norm: 3109.435 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 309/ 292968 | consumed samples: 632832 | consumed tokens: 43499520 | elapsed time per iteration (ms): 85783.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm
loss: 7.326544E+00 | loss scale: 4096.0 | grad norm: 2555.104 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 310/ 292968 | consumed samples: 634880 | consumed tokens: 43663360 | elapsed time per iteration (ms): 83860.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.320651E+00 | loss scale: 4096.0 | grad norm: 2512.891 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 311/ 292968 | consumed samples: 636928 | consumed tokens: 43827200 | elapsed time per iteration (ms): 83906.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.309007E+00 | loss scale: 4096.0 | grad norm: 3099.864 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 312/ 292968 | consumed samples: 638976 | consumed tokens: 43991040 | elapsed time per iteration (ms): 86905.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.319682E+00 | loss scale: 4096.0 | grad norm: 3241.992 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 313/ 292968 | consumed samples: 641024 | consumed tokens: 44154880 | elapsed time per iteration (ms): 85618.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.297444E+00 | loss scale: 4096.0 | grad norm: 2833.450 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 314/ 292968 | consumed samples: 643072 | consumed tokens: 44318720 | elapsed time per iteration (ms): 85737.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.292383E+00 | loss scale: 4096.0 | grad norm: 2985.790 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 315/ 292968 | consumed samples: 645120 | consumed tokens: 44482560 | elapsed time per iteration (ms): 86433.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.311911E+00 | loss scale: 4096.0 | grad norm: 2443.310 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 316/ 292968 | consumed samples: 647168 | consumed tokens: 44646400 | elapsed time per iteration (ms): 87543.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.292103E+00 | loss scale: 4096.0 | grad norm: 2322.894 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 317/ 292968 | consumed samples: 649216 | consumed tokens: 44810240 | elapsed time per iteration (ms): 84316.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.283496E+00 | loss scale: 4096.0 | grad norm: 3226.758 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 318/ 292968 | consumed samples: 651264 | consumed tokens: 44974080 | elapsed time per iteration (ms): 86117.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.282139E+00 | loss scale: 4096.0 | grad norm: 2866.516 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 319/ 292968 | consumed samples: 653312 | consumed tokens: 45137920 | elapsed time per iteration (ms): 83653.0 | learning 
rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.307741E+00 | loss scale: 4096.0 | grad norm: 3358.557 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 320/ 292968 | consumed samples: 655360 | consumed tokens: 45301760 | elapsed time per iteration (ms): 84349.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.288731E+00 | loss scale: 4096.0 | grad norm: 3130.875 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 321/ 292968 | consumed samples: 657408 | consumed tokens: 45465600 | elapsed time per iteration (ms): 84950.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.256417E+00 | loss scale: 4096.0 | grad norm: 2886.342 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 322/ 292968 | consumed samples: 659456 | consumed tokens: 45629440 | elapsed time per iteration (ms): 85774.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.265007E+00 | loss scale: 4096.0 | grad norm: 2230.256 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 323/ 292968 | consumed samples: 661504 | consumed tokens: 45793280 | elapsed time per iteration (ms): 85047.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.271173E+00 | loss scale: 4096.0 | grad norm: 1943.179 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 324/ 292968 | consumed samples: 663552 | consumed tokens: 45957120 | elapsed time per iteration (ms): 83635.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.277629E+00 | loss scale: 4096.0 | grad norm: 2270.882 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 325/ 292968 | consumed samples: 665600 | consumed tokens: 46120960 | elapsed time per iteration (ms): 84248.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.308369E+00 | loss scale: 4096.0 | grad norm: 2722.485 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 326/ 292968 | consumed samples: 667648 | consumed tokens: 46284800 | elapsed time per iteration (ms): 85830.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.300281E+00 | loss scale: 4096.0 | grad norm: 2734.466 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 327/ 292968 | consumed samples: 669696 | consumed tokens: 46448640 | elapsed time per iteration (ms): 86014.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.297026E+00 | loss scale: 4096.0 | grad norm: 2485.132 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 328/ 292968 | consumed samples: 671744 | consumed tokens: 46612480 | elapsed time per iteration (ms): 83124.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.289466E+00 | loss scale: 4096.0 | grad norm: 2448.125 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 329/ 292968 | consumed samples: 673792 | consumed tokens: 46776320 | 
elapsed time per iteration (ms): 85383.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.257540E+00 | loss scale: 4096.0 | grad norm: 2539.504 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 330/ 292968 | consumed samples: 675840 | consumed tokens: 46940160 | elapsed time per iteration (ms): 86007.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.270067E+00 | loss scale: 4096.0 | grad norm: 2835.253 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 331/ 292968 | consumed samples: 677888 | consumed tokens: 47104000 | elapsed time per iteration (ms): 85403.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.268996E+00 | loss scale: 4096.0 | grad norm: 2919.993 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 332/ 292968 | consumed samples: 679936 | consumed tokens: 47267840 | elapsed time per iteration (ms): 85148.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.266261E+00 | loss scale: 4096.0 | grad norm: 2836.807 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 333/ 292968 | consumed samples: 681984 | consumed tokens: 47431680 | elapsed time per iteration (ms): 88408.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.258134E+00 | loss scale: 4096.0 | grad norm: 2617.346 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 334/ 292968 | consumed samples: 684032 | consumed tokens: 47595520 | elapsed time per iteration (ms): 86921.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.302325E+00 | loss scale: 4096.0 | grad norm: 2321.216 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 335/ 292968 | consumed samples: 686080 | consumed tokens: 47759360 | elapsed time per iteration (ms): 84064.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.259216E+00 | loss scale: 4096.0 | grad norm: 2697.261 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 336/ 292968 | consumed samples: 688128 | consumed tokens: 47923200 | elapsed time per iteration (ms): 85304.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.290828E+00 | loss scale: 4096.0 | grad norm: 2570.093 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 337/ 292968 | consumed samples: 690176 | consumed tokens: 48087040 | elapsed time per iteration (ms): 85866.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.274489E+00 | loss scale: 4096.0 | grad norm: 2923.779 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 338/ 292968 | consumed samples: 692224 | consumed tokens: 48250880 | elapsed time per iteration (ms): 86194.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.282946E+00 | loss scale: 4096.0 | grad norm: 2732.406 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 339/ 292968 | consumed 
samples: 694272 | consumed tokens: 48414720 | elapsed time per iteration (ms): 85088.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.302647E+00 | loss scale: 4096.0 | grad norm: 2639.557 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 340/ 292968 | consumed samples: 696320 | consumed tokens: 48578560 | elapsed time per iteration (ms): 85642.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.275146E+00 | loss scale: 4096.0 | grad norm: 2841.419 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 341/ 292968 | consumed samples: 698368 | consumed tokens: 48742400 | elapsed time per iteration (ms): 83192.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.267980E+00 | loss scale: 4096.0 | grad norm: 3068.109 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 342/ 292968 | consumed samples: 700416 | consumed tokens: 48906240 | elapsed time per iteration (ms): 83856.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.277167E+00 | loss scale: 4096.0 | grad norm: 3140.251 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 343/ 292968 | consumed samples: 702464 | consumed tokens: 49070080 | elapsed time per iteration (ms): 85915.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.289451E+00 | loss scale: 4096.0 | grad norm: 3060.136 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 344/ 292968 | consumed samples: 704512 | consumed tokens: 49233920 | elapsed time per iteration (ms): 87860.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.273150E+00 | loss scale: 4096.0 | grad norm: 2659.173 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 345/ 292968 | consumed samples: 706560 | consumed tokens: 49397760 | elapsed time per iteration (ms): 86748.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.264841E+00 | loss scale: 4096.0 | grad norm: 2356.457 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 346/ 292968 | consumed samples: 708608 | consumed tokens: 49561600 | elapsed time per iteration (ms): 85040.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.283255E+00 | loss scale: 4096.0 | grad norm: 2521.901 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 347/ 292968 | consumed samples: 710656 | consumed tokens: 49725440 | elapsed time per iteration (ms): 85501.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.288081E+00 | loss scale: 4096.0 | grad norm: 2956.512 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 348/ 292968 | consumed samples: 712704 | consumed tokens: 49889280 | elapsed time per iteration (ms): 85639.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.291748E+00 | loss scale: 4096.0 | grad norm: 2382.694 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | 
-time (ms) - iteration 349/ 292968 | consumed samples: 714752 | consumed tokens: 50053120 | elapsed time per iteration (ms): 85034.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.250752E+00 | loss scale: 4096.0 | grad norm: 2589.155 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 350/ 292968 | consumed samples: 716800 | consumed tokens: 50216960 | elapsed time per iteration (ms): 88184.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.307037E+00 | loss scale: 4096.0 | grad norm: 2371.714 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 351/ 292968 | consumed samples: 718848 | consumed tokens: 50380800 | elapsed time per iteration (ms): 89218.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.299811E+00 | loss scale: 4096.0 | grad norm: 2118.604 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 352/ 292968 | consumed samples: 720896 | consumed tokens: 50544640 | elapsed time per iteration (ms): 83328.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.285224E+00 | loss scale: 4096.0 | grad norm: 2657.349 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 353/ 292968 | consumed samples: 722944 | consumed tokens: 50708480 | elapsed time per iteration (ms): 84303.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.316021E+00 | loss scale: 4096.0 | grad norm: 3146.009 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 354/ 292968 | consumed samples: 724992 | consumed tokens: 50872320 | elapsed time per iteration (ms): 87196.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.311716E+00 | loss scale: 4096.0 | grad norm: 3500.006 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 355/ 292968 | consumed samples: 727040 | consumed tokens: 51036160 | elapsed time per iteration (ms): 84621.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.331798E+00 | loss scale: 4096.0 | grad norm: 2947.109 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 356/ 292968 | consumed samples: 729088 | consumed tokens: 51200000 | elapsed time per iteration (ms): 83701.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.316767E+00 | loss scale: 4096.0 | grad norm: 2571.337 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 357/ 292968 | consumed samples: 731136 | consumed tokens: 51363840 | elapsed time per iteration (ms): 83996.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.316179E+00 | loss scale: 4096.0 | grad norm: 2556.934 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 358/ 292968 | consumed samples: 733184 | consumed tokens: 51527680 | elapsed time per iteration (ms): 84174.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.335028E+00 | loss scale: 4096.0 | grad norm: 2597.458 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 359/ 292968 | consumed samples: 735232 | consumed tokens: 51691520 | elapsed time per iteration (ms): 83278.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.371193E+00 | loss scale: 4096.0 | grad norm: 3194.708 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 360/ 292968 | consumed samples: 737280 | consumed tokens: 51855360 | elapsed time per iteration (ms): 86356.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.368884E+00 | loss scale: 4096.0 | grad norm: 3641.337 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 361/ 292968 | consumed samples: 739328 | consumed tokens: 52019200 | elapsed time per iteration (ms): 84668.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.346163E+00 | loss scale: 4096.0 | grad norm: 2728.876 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 362/ 292968 | consumed samples: 741376 | consumed tokens: 52183040 | elapsed time per iteration (ms): 85688.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.337568E+00 | loss scale: 4096.0 | grad norm: 2351.216 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 363/ 292968 | consumed samples: 743424 | consumed tokens: 52346880 | elapsed time per iteration (ms): 84108.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.351027E+00 | loss scale: 4096.0 | grad norm: 2375.735 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 364/ 292968 | consumed samples: 745472 | consumed tokens: 52510720 | elapsed time per iteration (ms): 83600.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.356211E+00 | loss scale: 4096.0 | grad norm: 2592.961 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 365/ 292968 | consumed samples: 747520 | consumed tokens: 52674560 | elapsed time per iteration (ms): 84340.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.343930E+00 | loss scale: 4096.0 | grad norm: 2528.956 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 366/ 292968 | consumed samples: 749568 | consumed tokens: 52838400 | elapsed time per iteration (ms): 83115.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.368711E+00 | loss scale: 4096.0 | grad norm: 2997.792 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 367/ 292968 | consumed samples: 751616 | consumed tokens: 53002240 | elapsed time per iteration (ms): 84848.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.410151E+00 | loss scale: 4096.0 | grad norm: 2645.993 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 368/ 292968 | consumed samples: 753664 | consumed tokens: 53166080 | elapsed time per iteration (ms): 86241.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.401436E+00 | loss scale: 4096.0 | grad norm: 2795.852 | num zeros: 
0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 369/ 292968 | consumed samples: 755712 | consumed tokens: 53329920 | elapsed time per iteration (ms): 83793.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.411240E+00 | loss scale: 4096.0 | grad norm: 3158.218 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 370/ 292968 | consumed samples: 757760 | consumed tokens: 53493760 | elapsed time per iteration (ms): 82670.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.416414E+00 | loss scale: 4096.0 | grad norm: 3766.280 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 371/ 292968 | consumed samples: 759808 | consumed tokens: 53657600 | elapsed time per iteration (ms): 84074.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.425708E+00 | loss scale: 4096.0 | grad norm: 2850.252 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 372/ 292968 | consumed samples: 761856 | consumed tokens: 53821440 | elapsed time per iteration (ms): 84306.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.449362E+00 | loss scale: 4096.0 | grad norm: 2335.863 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 373/ 292968 | consumed samples: 763904 | consumed tokens: 53985280 | elapsed time per iteration (ms): 85845.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.472668E+00 | loss scale: 4096.0 | grad norm: 3089.961 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 374/ 292968 | consumed samples: 765952 | consumed tokens: 54149120 | elapsed time per iteration (ms): 84241.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.421590E+00 | loss scale: 4096.0 | grad norm: 2485.059 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 375/ 292968 | consumed samples: 768000 | consumed tokens: 54312960 | elapsed time per iteration (ms): 82698.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.442699E+00 | loss scale: 4096.0 | grad norm: 2410.260 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 376/ 292968 | consumed samples: 770048 | consumed tokens: 54476800 | elapsed time per iteration (ms): 84554.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.475940E+00 | loss scale: 4096.0 | grad norm: 3077.124 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 377/ 292968 | consumed samples: 772096 | consumed tokens: 54640640 | elapsed time per iteration (ms): 83407.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.452914E+00 | loss scale: 4096.0 | grad norm: 3540.172 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 378/ 292968 | consumed samples: 774144 | consumed tokens: 54804480 | elapsed time per iteration (ms): 84717.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.465997E+00 | loss 
scale: 4096.0 | grad norm: 3261.752 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 379/ 292968 | consumed samples: 776192 | consumed tokens: 54968320 | elapsed time per iteration (ms): 84071.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.455918E+00 | loss scale: 4096.0 | grad norm: 2392.851 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 380/ 292968 | consumed samples: 778240 | consumed tokens: 55132160 | elapsed time per iteration (ms): 84090.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.469865E+00 | loss scale: 4096.0 | grad norm: 2634.668 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 381/ 292968 | consumed samples: 780288 | consumed tokens: 55296000 | elapsed time per iteration (ms): 82252.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.465951E+00 | loss scale: 4096.0 | grad norm: 3061.322 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 382/ 292968 | consumed samples: 782336 | consumed tokens: 55459840 | elapsed time per iteration (ms): 83945.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.471965E+00 | loss scale: 4096.0 | grad norm: 2375.583 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 383/ 292968 | consumed samples: 784384 | consumed tokens: 55623680 | elapsed time per iteration (ms): 83893.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.479254E+00 | loss scale: 4096.0 | grad norm: 3209.832 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 384/ 292968 | consumed samples: 786432 | consumed tokens: 55787520 | elapsed time per iteration (ms): 82575.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.511918E+00 | loss scale: 4096.0 | grad norm: 3857.875 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 385/ 292968 | consumed samples: 788480 | consumed tokens: 55951360 | elapsed time per iteration (ms): 83048.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.485046E+00 | loss scale: 4096.0 | grad norm: 3819.017 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 386/ 292968 | consumed samples: 790528 | consumed tokens: 56115200 | elapsed time per iteration (ms): 82493.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.487671E+00 | loss scale: 4096.0 | grad norm: 3008.471 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 387/ 292968 | consumed samples: 792576 | consumed tokens: 56279040 | elapsed time per iteration (ms): 83064.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.499905E+00 | loss scale: 4096.0 | grad norm: 2885.645 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 388/ 292968 | consumed samples: 794624 | consumed tokens: 56442880 | elapsed time per iteration (ms): 84599.1 | learning rate: 6.000E-05 | global 
batch size: 2048 | lm loss: 7.512537E+00 | loss scale: 4096.0 | grad norm: 2435.791 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 389/ 292968 | consumed samples: 796672 | consumed tokens: 56606720 | elapsed time per iteration (ms): 85335.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.518743E+00 | loss scale: 4096.0 | grad norm: 2630.157 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 390/ 292968 | consumed samples: 798720 | consumed tokens: 56770560 | elapsed time per iteration (ms): 82290.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.517368E+00 | loss scale: 4096.0 | grad norm: 2854.273 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 391/ 292968 | consumed samples: 800768 | consumed tokens: 56934400 | elapsed time per iteration (ms): 82574.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.495354E+00 | loss scale: 4096.0 | grad norm: 2770.231 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 392/ 292968 | consumed samples: 802816 | consumed tokens: 57098240 | elapsed time per iteration (ms): 82653.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.506156E+00 | loss scale: 4096.0 | grad norm: 2872.162 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 393/ 292968 | consumed samples: 804864 | consumed tokens: 57262080 | elapsed time per iteration (ms): 86498.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.494565E+00 | loss scale: 4096.0 | grad norm: 2958.523 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 394/ 292968 | consumed samples: 806912 | consumed tokens: 57425920 | elapsed time per iteration (ms): 85210.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.517799E+00 | loss scale: 4096.0 | grad norm: 2400.468 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 395/ 292968 | consumed samples: 808960 | consumed tokens: 57589760 | elapsed time per iteration (ms): 83511.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.520504E+00 | loss scale: 4096.0 | grad norm: 3047.666 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 396/ 292968 | consumed samples: 811008 | consumed tokens: 57753600 | elapsed time per iteration (ms): 86446.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.511170E+00 | loss scale: 4096.0 | grad norm: 2652.860 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 397/ 292968 | consumed samples: 813056 | consumed tokens: 57917440 | elapsed time per iteration (ms): 85898.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.523363E+00 | loss scale: 4096.0 | grad norm: 2134.214 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 398/ 292968 | consumed samples: 815104 | consumed tokens: 58081280 | elapsed time per iteration 
(ms): 84745.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.516827E+00 | loss scale: 4096.0 | grad norm: 2659.799 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 399/ 292968 | consumed samples: 817152 | consumed tokens: 58245120 | elapsed time per iteration (ms): 83283.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.531635E+00 | loss scale: 4096.0 | grad norm: 2508.139 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 400/ 292968 | consumed samples: 819200 | consumed tokens: 58408960 | elapsed time per iteration (ms): 85683.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.541448E+00 | loss scale: 4096.0 | grad norm: 2755.752 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 401/ 292968 | consumed samples: 821248 | consumed tokens: 58572800 | elapsed time per iteration (ms): 83885.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.530828E+00 | loss scale: 4096.0 | grad norm: 2583.181 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 402/ 292968 | consumed samples: 823296 | consumed tokens: 58736640 | elapsed time per iteration (ms): 82292.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.518544E+00 | loss scale: 4096.0 | grad norm: 2114.473 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 403/ 292968 | consumed samples: 825344 | consumed tokens: 58900480 | elapsed time per iteration (ms): 85167.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.506803E+00 | loss scale: 4096.0 | grad norm: 2327.567 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 404/ 292968 | consumed samples: 827392 | consumed tokens: 59064320 | elapsed time per iteration (ms): 84647.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.552835E+00 | loss scale: 4096.0 | grad norm: 2704.045 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 405/ 292968 | consumed samples: 829440 | consumed tokens: 59228160 | elapsed time per iteration (ms): 84024.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.516169E+00 | loss scale: 4096.0 | grad norm: 1984.430 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 406/ 292968 | consumed samples: 831488 | consumed tokens: 59392000 | elapsed time per iteration (ms): 85240.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.534061E+00 | loss scale: 4096.0 | grad norm: 2499.451 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 407/ 292968 | consumed samples: 833536 | consumed tokens: 59555840 | elapsed time per iteration (ms): 81937.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.538692E+00 | loss scale: 4096.0 | grad norm: 2276.613 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 408/ 292968 | consumed samples: 835584 | consumed 
tokens: 59719680 | elapsed time per iteration (ms): 82352.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.527668E+00 | loss scale: 4096.0 | grad norm: 2121.233 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 409/ 292968 | consumed samples: 837632 | consumed tokens: 59883520 | elapsed time per iteration (ms): 84009.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.523369E+00 | loss scale: 4096.0 | grad norm: 2322.948 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 410/ 292968 | consumed samples: 839680 | consumed tokens: 60047360 | elapsed time per iteration (ms): 84533.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.517059E+00 | loss scale: 4096.0 | grad norm: 2574.142 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 411/ 292968 | consumed samples: 841728 | consumed tokens: 60211200 | elapsed time per iteration (ms): 82840.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.518081E+00 | loss scale: 4096.0 | grad norm: 2067.488 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 412/ 292968 | consumed samples: 843776 | consumed tokens: 60375040 | elapsed time per iteration (ms): 82612.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.550312E+00 | loss scale: 4096.0 | grad norm: 3038.160 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 413/ 292968 | consumed samples: 845824 | consumed tokens: 60538880 | elapsed time per iteration (ms): 81753.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.523160E+00 | loss scale: 4096.0 | grad norm: 2323.494 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 414/ 292968 | consumed samples: 847872 | consumed tokens: 60702720 | elapsed time per iteration (ms): 83112.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.526362E+00 | loss scale: 4096.0 | grad norm: 2254.803 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 415/ 292968 | consumed samples: 849920 | consumed tokens: 60866560 | elapsed time per iteration (ms): 83567.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.556863E+00 | loss scale: 4096.0 | grad norm: 2247.028 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 416/ 292968 | consumed samples: 851968 | consumed tokens: 61030400 | elapsed time per iteration (ms): 85052.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.529713E+00 | loss scale: 4096.0 | grad norm: 2077.980 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 417/ 292968 | consumed samples: 854016 | consumed tokens: 61194240 | elapsed time per iteration (ms): 84672.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.541699E+00 | loss scale: 4096.0 | grad norm: 2337.793 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 418/ 
292968 | consumed samples: 856064 | consumed tokens: 61358080 | elapsed time per iteration (ms): 81752.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.533653E+00 | loss scale: 4096.0 | grad norm: 2327.654 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 419/ 292968 | consumed samples: 858112 | consumed tokens: 61521920 | elapsed time per iteration (ms): 85962.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.535294E+00 | loss scale: 4096.0 | grad norm: 2374.483 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 420/ 292968 | consumed samples: 860160 | consumed tokens: 61685760 | elapsed time per iteration (ms): 81962.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.532289E+00 | loss scale: 4096.0 | grad norm: 1907.133 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 421/ 292968 | consumed samples: 862208 | consumed tokens: 61849600 | elapsed time per iteration (ms): 82776.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.534570E+00 | loss scale: 4096.0 | grad norm: 2169.051 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 422/ 292968 | consumed samples: 864256 | consumed tokens: 62013440 | elapsed time per iteration (ms): 82119.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.526681E+00 | loss scale: 4096.0 | grad norm: 2113.544 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 423/ 292968 | consumed samples: 866304 | consumed tokens: 62177280 | elapsed time per iteration (ms): 82975.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.536790E+00 | loss scale: 4096.0 | grad norm: 2054.942 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 424/ 292968 | consumed samples: 868352 | consumed tokens: 62341120 | elapsed time per iteration (ms): 83629.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.527515E+00 | loss scale: 4096.0 | grad norm: 2169.183 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 425/ 292968 | consumed samples: 870400 | consumed tokens: 62504960 | elapsed time per iteration (ms): 83014.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.530001E+00 | loss scale: 4096.0 | grad norm: 2515.974 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 426/ 292968 | consumed samples: 872448 | consumed tokens: 62668800 | elapsed time per iteration (ms): 84796.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.526751E+00 | loss scale: 4096.0 | grad norm: 2476.350 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 427/ 292968 | consumed samples: 874496 | consumed tokens: 62832640 | elapsed time per iteration (ms): 84535.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.541574E+00 | loss scale: 4096.0 | grad norm: 2966.665 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of 
nan iterations: 0 | -time (ms) - iteration 428/ 292968 | consumed samples: 876544 | consumed tokens: 62996480 | elapsed time per iteration (ms): 82680.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.522227E+00 | loss scale: 4096.0 | grad norm: 2050.132 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 429/ 292968 | consumed samples: 878592 | consumed tokens: 63160320 | elapsed time per iteration (ms): 84921.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.515896E+00 | loss scale: 4096.0 | grad norm: 2198.395 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 430/ 292968 | consumed samples: 880640 | consumed tokens: 63324160 | elapsed time per iteration (ms): 84952.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.530925E+00 | loss scale: 4096.0 | grad norm: 2780.993 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 431/ 292968 | consumed samples: 882688 | consumed tokens: 63488000 | elapsed time per iteration (ms): 84583.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.510372E+00 | loss scale: 4096.0 | grad norm: 2142.460 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 432/ 292968 | consumed samples: 884736 | consumed tokens: 63651840 | elapsed time per iteration (ms): 83553.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.528667E+00 | loss scale: 4096.0 | grad norm: 2177.107 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 433/ 292968 | consumed samples: 886784 | consumed tokens: 63815680 | elapsed time per iteration (ms): 85517.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.522967E+00 | loss scale: 4096.0 | grad norm: 2182.786 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 434/ 292968 | consumed samples: 888832 | consumed tokens: 63979520 | elapsed time per iteration (ms): 82023.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.531892E+00 | loss scale: 4096.0 | grad norm: 1939.569 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 435/ 292968 | consumed samples: 890880 | consumed tokens: 64143360 | elapsed time per iteration (ms): 83552.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.515239E+00 | loss scale: 4096.0 | grad norm: 1870.309 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 436/ 292968 | consumed samples: 892928 | consumed tokens: 64323584 | elapsed time per iteration (ms): 83843.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.543612E+00 | loss scale: 4096.0 | grad norm: 2736.532 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 437/ 292968 | consumed samples: 894976 | consumed tokens: 64503808 | elapsed time per iteration (ms): 81961.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.546953E+00 | loss scale: 4096.0 | grad norm: 2299.948 | num zeros: 0.0 | curriculum seqlen: 88 
| number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 438/ 292968 | consumed samples: 897024 | consumed tokens: 64684032 | elapsed time per iteration (ms): 81119.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.544587E+00 | loss scale: 4096.0 | grad norm: 2142.414 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 439/ 292968 | consumed samples: 899072 | consumed tokens: 64864256 | elapsed time per iteration (ms): 83531.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.538731E+00 | loss scale: 4096.0 | grad norm: 3027.466 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 440/ 292968 | consumed samples: 901120 | consumed tokens: 65044480 | elapsed time per iteration (ms): 83017.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.547732E+00 | loss scale: 4096.0 | grad norm: 2559.613 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 441/ 292968 | consumed samples: 903168 | consumed tokens: 65224704 | elapsed time per iteration (ms): 82447.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.546313E+00 | loss scale: 4096.0 | grad norm: 2947.133 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 442/ 292968 | consumed samples: 905216 | consumed tokens: 65404928 | elapsed time per iteration (ms): 80803.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.534683E+00 | loss scale: 4096.0 | grad norm: 2390.260 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 443/ 292968 | consumed samples: 907264 | consumed tokens: 65585152 | elapsed time per iteration (ms): 83593.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.536517E+00 | loss scale: 4096.0 | grad norm: 3007.819 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 444/ 292968 | consumed samples: 909312 | consumed tokens: 65765376 | elapsed time per iteration (ms): 83201.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.533750E+00 | loss scale: 4096.0 | grad norm: 2487.030 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 445/ 292968 | consumed samples: 911360 | consumed tokens: 65945600 | elapsed time per iteration (ms): 83186.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.557394E+00 | loss scale: 4096.0 | grad norm: 2338.218 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 446/ 292968 | consumed samples: 913408 | consumed tokens: 66125824 | elapsed time per iteration (ms): 83538.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.532371E+00 | loss scale: 4096.0 | grad norm: 2322.910 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 447/ 292968 | consumed samples: 915456 | consumed tokens: 66306048 | elapsed time per iteration (ms): 82191.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.540046E+00 | loss scale: 4096.0 | grad norm: 
2242.005 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 448/ 292968 | consumed samples: 917504 | consumed tokens: 66486272 | elapsed time per iteration (ms): 82143.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.523855E+00 | loss scale: 4096.0 | grad norm: 2266.417 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 449/ 292968 | consumed samples: 919552 | consumed tokens: 66666496 | elapsed time per iteration (ms): 82324.8 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.546436E+00 | loss scale: 4096.0 | grad norm: 2474.338 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 450/ 292968 | consumed samples: 921600 | consumed tokens: 66846720 | elapsed time per iteration (ms): 82405.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.513302E+00 | loss scale: 4096.0 | grad norm: 2809.811 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 451/ 292968 | consumed samples: 923648 | consumed tokens: 67026944 | elapsed time per iteration (ms): 82717.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.553439E+00 | loss scale: 4096.0 | grad norm: 2027.531 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 452/ 292968 | consumed samples: 925696 | consumed tokens: 67207168 | elapsed time per iteration (ms): 81541.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.565411E+00 | loss scale: 4096.0 | grad norm: 2263.155 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 453/ 292968 | consumed samples: 927744 | consumed tokens: 67387392 | elapsed time per iteration (ms): 83757.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.544087E+00 | loss scale: 4096.0 | grad norm: 2265.025 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 454/ 292968 | consumed samples: 929792 | consumed tokens: 67567616 | elapsed time per iteration (ms): 82882.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.557709E+00 | loss scale: 4096.0 | grad norm: 2257.113 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 455/ 292968 | consumed samples: 931840 | consumed tokens: 67747840 | elapsed time per iteration (ms): 81521.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.558704E+00 | loss scale: 4096.0 | grad norm: 2500.031 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 456/ 292968 | consumed samples: 933888 | consumed tokens: 67928064 | elapsed time per iteration (ms): 82926.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.543804E+00 | loss scale: 4096.0 | grad norm: 3298.771 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 457/ 292968 | consumed samples: 935936 | consumed tokens: 68108288 | elapsed time per iteration (ms): 83063.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 
7.546052E+00 | loss scale: 4096.0 | grad norm: 2237.762 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 458/ 292968 | consumed samples: 937984 | consumed tokens: 68288512 | elapsed time per iteration (ms): 82631.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.547925E+00 | loss scale: 4096.0 | grad norm: 3112.005 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 459/ 292968 | consumed samples: 940032 | consumed tokens: 68468736 | elapsed time per iteration (ms): 83045.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.547452E+00 | loss scale: 4096.0 | grad norm: 2249.644 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 460/ 292968 | consumed samples: 942080 | consumed tokens: 68648960 | elapsed time per iteration (ms): 82647.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.555418E+00 | loss scale: 4096.0 | grad norm: 2187.525 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 461/ 292968 | consumed samples: 944128 | consumed tokens: 68829184 | elapsed time per iteration (ms): 82736.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.577223E+00 | loss scale: 4096.0 | grad norm: 2624.831 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 462/ 292968 | consumed samples: 946176 | consumed tokens: 69009408 | elapsed time per iteration (ms): 81759.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.562142E+00 | loss scale: 4096.0 | grad norm: 2271.203 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 463/ 292968 | consumed samples: 948224 | consumed tokens: 69189632 | elapsed time per iteration (ms): 83934.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.565150E+00 | loss scale: 4096.0 | grad norm: 2573.933 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 464/ 292968 | consumed samples: 950272 | consumed tokens: 69369856 | elapsed time per iteration (ms): 85192.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.569502E+00 | loss scale: 4096.0 | grad norm: 2157.316 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 465/ 292968 | consumed samples: 952320 | consumed tokens: 69550080 | elapsed time per iteration (ms): 84180.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.565602E+00 | loss scale: 4096.0 | grad norm: 2110.637 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 466/ 292968 | consumed samples: 954368 | consumed tokens: 69730304 | elapsed time per iteration (ms): 83216.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.554029E+00 | loss scale: 4096.0 | grad norm: 2215.039 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 467/ 292968 | consumed samples: 956416 | consumed tokens: 69910528 | elapsed time per iteration (ms): 81086.6 | learning rate: 
6.000E-05 | global batch size: 2048 | lm loss: 7.580177E+00 | loss scale: 4096.0 | grad norm: 2526.559 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 468/ 292968 | consumed samples: 958464 | consumed tokens: 70090752 | elapsed time per iteration (ms): 81543.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.575295E+00 | loss scale: 4096.0 | grad norm: 2435.336 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 469/ 292968 | consumed samples: 960512 | consumed tokens: 70270976 | elapsed time per iteration (ms): 83995.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.556109E+00 | loss scale: 4096.0 | grad norm: 2854.660 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 470/ 292968 | consumed samples: 962560 | consumed tokens: 70451200 | elapsed time per iteration (ms): 82368.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.568743E+00 | loss scale: 4096.0 | grad norm: 3487.078 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 471/ 292968 | consumed samples: 964608 | consumed tokens: 70631424 | elapsed time per iteration (ms): 81649.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.580371E+00 | loss scale: 4096.0 | grad norm: 2771.347 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 472/ 292968 | consumed samples: 966656 | consumed tokens: 70811648 | elapsed time per iteration (ms): 83683.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.581263E+00 | loss scale: 4096.0 | grad norm: 2186.138 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 473/ 292968 | consumed samples: 968704 | consumed tokens: 70991872 | elapsed time per iteration (ms): 80961.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.602887E+00 | loss scale: 4096.0 | grad norm: 2181.590 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 474/ 292968 | consumed samples: 970752 | consumed tokens: 71172096 | elapsed time per iteration (ms): 82963.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.617973E+00 | loss scale: 4096.0 | grad norm: 2880.327 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 475/ 292968 | consumed samples: 972800 | consumed tokens: 71352320 | elapsed time per iteration (ms): 82408.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.605627E+00 | loss scale: 4096.0 | grad norm: 2176.481 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 476/ 292968 | consumed samples: 974848 | consumed tokens: 71532544 | elapsed time per iteration (ms): 82974.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.610926E+00 | loss scale: 4096.0 | grad norm: 2668.008 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 477/ 292968 | consumed samples: 976896 | consumed tokens: 71712768 | elapsed time 
per iteration (ms): 83862.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.605448E+00 | loss scale: 4096.0 | grad norm: 2785.464 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 478/ 292968 | consumed samples: 978944 | consumed tokens: 71892992 | elapsed time per iteration (ms): 81074.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.610027E+00 | loss scale: 4096.0 | grad norm: 2785.927 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 479/ 292968 | consumed samples: 980992 | consumed tokens: 72073216 | elapsed time per iteration (ms): 82798.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.619042E+00 | loss scale: 4096.0 | grad norm: 2252.657 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 480/ 292968 | consumed samples: 983040 | consumed tokens: 72253440 | elapsed time per iteration (ms): 82822.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.612586E+00 | loss scale: 4096.0 | grad norm: 2413.833 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 481/ 292968 | consumed samples: 985088 | consumed tokens: 72433664 | elapsed time per iteration (ms): 83304.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.640561E+00 | loss scale: 4096.0 | grad norm: 2441.591 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 482/ 292968 | consumed samples: 987136 | consumed tokens: 72613888 | elapsed time per iteration (ms): 84879.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.611023E+00 | loss scale: 4096.0 | grad norm: 2342.209 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 483/ 292968 | consumed samples: 989184 | consumed tokens: 72794112 | elapsed time per iteration (ms): 83521.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.647702E+00 | loss scale: 4096.0 | grad norm: 2009.804 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 484/ 292968 | consumed samples: 991232 | consumed tokens: 72974336 | elapsed time per iteration (ms): 85304.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.615279E+00 | loss scale: 4096.0 | grad norm: 2431.016 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 485/ 292968 | consumed samples: 993280 | consumed tokens: 73154560 | elapsed time per iteration (ms): 83221.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.617563E+00 | loss scale: 4096.0 | grad norm: 2332.468 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 486/ 292968 | consumed samples: 995328 | consumed tokens: 73334784 | elapsed time per iteration (ms): 84452.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.642996E+00 | loss scale: 4096.0 | grad norm: 2293.045 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 487/ 292968 | consumed samples: 
997376 | consumed tokens: 73515008 | elapsed time per iteration (ms): 81694.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.640726E+00 | loss scale: 4096.0 | grad norm: 2161.555 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 488/ 292968 | consumed samples: 999424 | consumed tokens: 73695232 | elapsed time per iteration (ms): 82457.2 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.646109E+00 | loss scale: 4096.0 | grad norm: 1998.380 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 489/ 292968 | consumed samples: 1001472 | consumed tokens: 73875456 | elapsed time per iteration (ms): 82638.6 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.615416E+00 | loss scale: 4096.0 | grad norm: 2314.776 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 490/ 292968 | consumed samples: 1003520 | consumed tokens: 74055680 | elapsed time per iteration (ms): 84874.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.638342E+00 | loss scale: 4096.0 | grad norm: 2012.102 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 491/ 292968 | consumed samples: 1005568 | consumed tokens: 74235904 | elapsed time per iteration (ms): 81954.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.672338E+00 | loss scale: 4096.0 | grad norm: 2193.039 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 492/ 292968 | consumed samples: 1007616 | consumed tokens: 74416128 | elapsed time per iteration (ms): 82728.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.638321E+00 | loss scale: 4096.0 | grad norm: 2302.749 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 493/ 292968 | consumed samples: 1009664 | consumed tokens: 74596352 | elapsed time per iteration (ms): 82350.0 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.653313E+00 | loss scale: 4096.0 | grad norm: 2344.943 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 494/ 292968 | consumed samples: 1011712 | consumed tokens: 74776576 | elapsed time per iteration (ms): 81785.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.657078E+00 | loss scale: 4096.0 | grad norm: 2214.307 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 495/ 292968 | consumed samples: 1013760 | consumed tokens: 74956800 | elapsed time per iteration (ms): 82994.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.647707E+00 | loss scale: 4096.0 | grad norm: 2218.280 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 496/ 292968 | consumed samples: 1015808 | consumed tokens: 75137024 | elapsed time per iteration (ms): 81579.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.651341E+00 | loss scale: 4096.0 | grad norm: 2290.442 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | 
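
[Annotation, not part of the original log.] The records above encode the curriculum-learning bookkeeping: consumed samples advance by the global batch size (2048) every iteration, and consumed tokens advance by global batch size times the current curriculum seqlen, which steps from 80 to 88 at iteration 436 (per-iteration token delta 2048 * 80 = 163,840 before, 2048 * 88 = 180,224 after). A minimal sketch of that arithmetic, checked against values copied from the log; the helper name is hypothetical, not from the training code:

```python
# Sketch of the consumed-samples / consumed-tokens counters as the log reports
# them, assuming tokens advance by global_batch_size * curriculum_seqlen per
# iteration. Helper name is hypothetical; the numbers are taken from the log.
GLOBAL_BATCH_SIZE = 2048

def advance(consumed_samples: int, consumed_tokens: int, curriculum_seqlen: int):
    """Advance both counters by one iteration, as the records above do."""
    return (consumed_samples + GLOBAL_BATCH_SIZE,
            consumed_tokens + GLOBAL_BATCH_SIZE * curriculum_seqlen)

# Iteration 435 -> 436, where curriculum seqlen steps from 80 to 88:
samples, tokens = 890880, 64143360                 # logged at iteration 435
samples, tokens = advance(samples, tokens, 88)
assert (samples, tokens) == (892928, 64323584)     # logged at iteration 436
```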
-time (ms) - iteration 497/ 292968 | consumed samples: 1017856 | consumed tokens: 75317248 | elapsed time per iteration (ms): 82780.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.628348E+00 | loss scale: 4096.0 | grad norm: 2732.969 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 498/ 292968 | consumed samples: 1019904 | consumed tokens: 75497472 | elapsed time per iteration (ms): 80775.5 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.661420E+00 | loss scale: 4096.0 | grad norm: 2730.811 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 499/ 292968 | consumed samples: 1021952 | consumed tokens: 75677696 | elapsed time per iteration (ms): 82615.9 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.653955E+00 | loss scale: 4096.0 | grad norm: 2656.733 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 500/ 292968 | consumed samples: 1024000 | consumed tokens: 75857920 | elapsed time per iteration (ms): 83554.1 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.635319E+00 | loss scale: 8192.0 | grad norm: 2675.817 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 501/ 292968 | consumed samples: 1026048 | consumed tokens: 76038144 | elapsed time per iteration (ms): 87892.4 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.641971E+00 | loss scale: 8192.0 | grad norm: 6771.167 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 502/ 292968 | consumed samples: 1028096 | consumed tokens: 76218368 | elapsed time per iteration (ms): 86764.3 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.644740E+00 | loss scale: 8192.0 | grad norm: 30491.498 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 503/ 292968 | consumed samples: 1030144 | consumed tokens: 76398592 | elapsed time per iteration (ms): 85440.7 | learning rate: 6.000E-05 | global batch size: 2048 | lm loss: 7.629947E+00 | loss scale: 8192.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 504/ 292968 | consumed samples: 1032192 | consumed tokens: 76578816 | elapsed time per iteration (ms): 82573.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 8192.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 505/ 292968 | consumed samples: 1034240 | consumed tokens: 76759040 | elapsed time per iteration (ms): 83552.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 4096.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 506/ 292968 | consumed samples: 1036288 | consumed tokens: 76939264 | elapsed time per iteration (ms): 81676.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 2048.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) 
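
[Annotation, not part of the original log.] The span above and below records a loss-scale incident: the scale doubles from 4096 to 8192 at iteration 500, grad norm then spikes (6771 -> 30491 -> 45230), the lm loss field is absent from the records from iteration 504 onward, and the scale is halved on each of iterations 505-517 (8192 -> 4096 -> ... -> 1.0, thirteen halvings) before plateauing at 1.0, while grad norm is reported unchanged at 45230.465, consistent with iterations overflowing and the last computed value being carried over. A minimal sketch of a dynamic loss scaler consistent with this trace, in the spirit of mixed-precision scalers such as DeepSpeed's; the growth interval, factors, and floor below are assumptions, not this run's actual configuration:

```python
# Sketch of dynamic loss scaling consistent with the trace: the scale grows
# after a window of overflow-free steps and is halved on every overflowing
# step until it reaches a floor. All constants here are assumptions.
class DynamicLossScaler:
    def __init__(self, init_scale=4096.0, growth_interval=500,
                 growth_factor=2.0, backoff_factor=0.5, min_scale=1.0):
        self.scale = init_scale
        self.growth_interval = growth_interval
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.min_scale = min_scale
        self._good_steps = 0

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            # Overflowing step: skip the update and back off the scale.
            self.scale = max(self.scale * self.backoff_factor, self.min_scale)
            self._good_steps = 0
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale *= self.growth_factor   # e.g. 4096 -> 8192

scaler = DynamicLossScaler()
# 500 overflow-free steps (scale doubles to 8192), then 13 overflows in a row:
for overflow in [False] * 500 + [True] * 13:
    scaler.update(overflow)
assert scaler.scale == 1.0   # matches the plateau at loss scale 1.0 above
```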
- iteration 507/ 292968 | consumed samples: 1038336 | consumed tokens: 77119488 | elapsed time per iteration (ms): 82681.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1024.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 508/ 292968 | consumed samples: 1040384 | consumed tokens: 77299712 | elapsed time per iteration (ms): 81912.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 512.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 509/ 292968 | consumed samples: 1042432 | consumed tokens: 77479936 | elapsed time per iteration (ms): 82860.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 256.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 510/ 292968 | consumed samples: 1044480 | consumed tokens: 77660160 | elapsed time per iteration (ms): 82844.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 128.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 511/ 292968 | consumed samples: 1046528 | consumed tokens: 77840384 | elapsed time per iteration (ms): 81030.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 64.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 512/ 292968 | consumed samples: 1048576 | consumed tokens: 78020608 | elapsed time per iteration (ms): 84777.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 32.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 513/ 292968 | consumed samples: 1050624 | consumed tokens: 78200832 | elapsed time per iteration (ms): 81941.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 16.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 514/ 292968 | consumed samples: 1052672 | consumed tokens: 78381056 | elapsed time per iteration (ms): 81258.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 8.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 515/ 292968 | consumed samples: 1054720 | consumed tokens: 78561280 | elapsed time per iteration (ms): 82462.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 4.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 516/ 292968 | consumed samples: 1056768 | consumed tokens: 78741504 | elapsed time per iteration (ms): 80771.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 2.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 517/ 292968 | consumed samples: 1058816 | consumed tokens: 78921728 | elapsed time per iteration (ms): 81862.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 
1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 518/ 292968 | consumed samples: 1060864 | consumed tokens: 79101952 | elapsed time per iteration (ms): 82394.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 519/ 292968 | consumed samples: 1062912 | consumed tokens: 79282176 | elapsed time per iteration (ms): 82854.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 520/ 292968 | consumed samples: 1064960 | consumed tokens: 79462400 | elapsed time per iteration (ms): 82895.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 521/ 292968 | consumed samples: 1067008 | consumed tokens: 79642624 | elapsed time per iteration (ms): 81145.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 522/ 292968 | consumed samples: 1069056 | consumed tokens: 79822848 | elapsed time per iteration (ms): 81517.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 523/ 292968 | consumed samples: 1071104 | consumed tokens: 80003072 | elapsed time per iteration (ms): 81345.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 524/ 292968 | consumed samples: 1073152 | consumed tokens: 80183296 | elapsed time per iteration (ms): 81761.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 525/ 292968 | consumed samples: 1075200 | consumed tokens: 80363520 | elapsed time per iteration (ms): 84448.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 526/ 292968 | consumed samples: 1077248 | consumed tokens: 80543744 | elapsed time per iteration (ms): 82562.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 527/ 292968 | consumed samples: 1079296 | consumed tokens: 80723968 | elapsed time per iteration (ms): 83943.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 528/ 292968 | consumed samples: 1081344 | 
consumed tokens: 80904192 | elapsed time per iteration (ms): 81453.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 529/ 292968 | consumed samples: 1083392 | consumed tokens: 81084416 | elapsed time per iteration (ms): 83728.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 530/ 292968 | consumed samples: 1085440 | consumed tokens: 81264640 | elapsed time per iteration (ms): 81894.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 531/ 292968 | consumed samples: 1087488 | consumed tokens: 81444864 | elapsed time per iteration (ms): 81132.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 532/ 292968 | consumed samples: 1089536 | consumed tokens: 81625088 | elapsed time per iteration (ms): 82118.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 533/ 292968 | consumed samples: 1091584 | consumed tokens: 81805312 | elapsed time per iteration (ms): 82287.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 534/ 292968 | consumed samples: 1093632 | consumed tokens: 81985536 | elapsed time per iteration (ms): 81966.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 535/ 292968 | consumed samples: 1095680 | consumed tokens: 82165760 | elapsed time per iteration (ms): 84694.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 536/ 292968 | consumed samples: 1097728 | consumed tokens: 82345984 | elapsed time per iteration (ms): 83780.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 537/ 292968 | consumed samples: 1099776 | consumed tokens: 82526208 | elapsed time per iteration (ms): 82531.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 538/ 292968 | consumed samples: 1101824 | consumed tokens: 82706432 | elapsed time per iteration (ms): 82517.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 
| number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 539/ 292968 | consumed samples: 1103872 | consumed tokens: 82886656 | elapsed time per iteration (ms): 82328.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 540/ 292968 | consumed samples: 1105920 | consumed tokens: 83066880 | elapsed time per iteration (ms): 81576.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 541/ 292968 | consumed samples: 1107968 | consumed tokens: 83247104 | elapsed time per iteration (ms): 83862.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 542/ 292968 | consumed samples: 1110016 | consumed tokens: 83427328 | elapsed time per iteration (ms): 82443.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 543/ 292968 | consumed samples: 1112064 | consumed tokens: 83607552 | elapsed time per iteration (ms): 82301.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 544/ 292968 | consumed samples: 1114112 | consumed tokens: 83787776 | elapsed time per iteration (ms): 83217.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 545/ 292968 | consumed samples: 1116160 | consumed tokens: 83968000 | elapsed time per iteration (ms): 85001.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 546/ 292968 | consumed samples: 1118208 | consumed tokens: 84148224 | elapsed time per iteration (ms): 83602.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 547/ 292968 | consumed samples: 1120256 | consumed tokens: 84328448 | elapsed time per iteration (ms): 85923.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 548/ 292968 | consumed samples: 1122304 | consumed tokens: 84508672 | elapsed time per iteration (ms): 83048.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 549/ 292968 | consumed samples: 1124352 | consumed tokens: 84688896 | elapsed time per iteration (ms): 82460.0 | 
learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
iterations 550-580, one record per line; fields identical on every record (learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |, each record followed by a bare "time (ms)" timer line) are listed once here:
- iteration  550/  292968 | consumed samples: 1126400 | consumed tokens: 84869120 | elapsed time per iteration (ms): 80644.4
- iteration  551/  292968 | consumed samples: 1128448 | consumed tokens: 85049344 | elapsed time per iteration (ms): 81005.1
- iteration  552/  292968 | consumed samples: 1130496 | consumed tokens: 85229568 | elapsed time per iteration (ms): 84502.7
- iteration  553/  292968 | consumed samples: 1132544 | consumed tokens: 85409792 | elapsed time per iteration (ms): 82098.9
- iteration  554/  292968 | consumed samples: 1134592 | consumed tokens: 85590016 | elapsed time per iteration (ms): 83050.3
- iteration  555/  292968 | consumed samples: 1136640 | consumed tokens: 85770240 | elapsed time per iteration (ms): 82094.7
- iteration  556/  292968 | consumed samples: 1138688 | consumed tokens: 85950464 | elapsed time per iteration (ms): 81973.4
- iteration  557/  292968 | consumed samples: 1140736 | consumed tokens: 86130688 | elapsed time per iteration (ms): 80547.8
- iteration  558/  292968 | consumed samples: 1142784 | consumed tokens: 86310912 | elapsed time per iteration (ms): 81927.8
- iteration  559/  292968 | consumed samples: 1144832 | consumed tokens: 86491136 | elapsed time per iteration (ms): 82924.2
- iteration  560/  292968 | consumed samples: 1146880 | consumed tokens: 86671360 | elapsed time per iteration (ms): 81532.2
- iteration  561/  292968 | consumed samples: 1148928 | consumed tokens: 86851584 | elapsed time per iteration (ms): 84191.8
- iteration  562/  292968 | consumed samples: 1150976 | consumed tokens: 87031808 | elapsed time per iteration (ms): 84471.5
- iteration  563/  292968 | consumed samples: 1153024 | consumed tokens: 87212032 | elapsed time per iteration (ms): 84513.9
- iteration  564/  292968 | consumed samples: 1155072 | consumed tokens: 87392256 | elapsed time per iteration (ms): 82698.6
- iteration  565/  292968 | consumed samples: 1157120 | consumed tokens: 87572480 | elapsed time per iteration (ms): 81478.7
- iteration  566/  292968 | consumed samples: 1159168 | consumed tokens: 87752704 | elapsed time per iteration (ms): 81842.5
- iteration  567/  292968 | consumed samples: 1161216 | consumed tokens: 87932928 | elapsed time per iteration (ms): 84425.7
- iteration  568/  292968 | consumed samples: 1163264 | consumed tokens: 88113152 | elapsed time per iteration (ms): 82715.4
- iteration  569/  292968 | consumed samples: 1165312 | consumed tokens: 88293376 | elapsed time per iteration (ms): 83499.8
- iteration  570/  292968 | consumed samples: 1167360 | consumed tokens: 88473600 | elapsed time per iteration (ms): 82851.0
- iteration  571/  292968 | consumed samples: 1169408 | consumed tokens: 88653824 | elapsed time per iteration (ms): 84790.5
- iteration  572/  292968 | consumed samples: 1171456 | consumed tokens: 88834048 | elapsed time per iteration (ms): 81366.4
- iteration  573/  292968 | consumed samples: 1173504 | consumed tokens: 89014272 | elapsed time per iteration (ms): 83901.8
- iteration  574/  292968 | consumed samples: 1175552 | consumed tokens: 89194496 | elapsed time per iteration (ms): 84895.5
- iteration  575/  292968 | consumed samples: 1177600 | consumed tokens: 89374720 | elapsed time per iteration (ms): 82094.3
- iteration  576/  292968 | consumed samples: 1179648 | consumed tokens: 89554944 | elapsed time per iteration (ms): 81710.3
- iteration  577/  292968 | consumed samples: 1181696 | consumed tokens: 89735168 | elapsed time per iteration (ms): 80939.4
- iteration  578/  292968 | consumed samples: 1183744 | consumed tokens: 89915392 | elapsed time per iteration (ms): 80955.3
- iteration  579/  292968 | consumed samples: 1185792 | consumed tokens: 90095616 | elapsed time per iteration (ms): 82536.4
- iteration  580/  292968 | consumed samples: 1187840 | consumed tokens: 90275840 | elapsed time per iteration (ms): 84037.1
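The per-iteration token increments in the records above and below are consistent with each step consuming global-batch-size x curriculum-seqlen tokens: 2048 x 88 = 180,224 through iteration 580, and 2048 x 96 = 196,608 once the curriculum seqlen rises to 96 at iteration 581 below. A minimal sketch checking that arithmetic against the logged values; the constants are read off the log, and the helper itself is illustrative rather than part of the training code:

```python
# Illustrative check, not part of Megatron-DeepSpeed: tokens consumed per
# iteration should equal global_batch_size * curriculum_seqlen.
GLOBAL_BATCH_SIZE = 2048  # "global batch size" field in the log

def tokens_per_iteration(curriculum_seqlen: int) -> int:
    return GLOBAL_BATCH_SIZE * curriculum_seqlen

# Deltas taken from consecutive "consumed tokens" values in the log:
assert tokens_per_iteration(88) == 85049344 - 84869120  # == 180224 (iters 550 -> 551)
assert tokens_per_iteration(96) == 90669056 - 90472448  # == 196608 (iters 581 -> 582)
```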
iterations 581-600; the curriculum seqlen rises to 96 here, so every record now reads curriculum seqlen: 96 (all other constant fields as above):
- iteration  581/  292968 | consumed samples: 1189888 | consumed tokens: 90472448 | elapsed time per iteration (ms): 77620.5
- iteration  582/  292968 | consumed samples: 1191936 | consumed tokens: 90669056 | elapsed time per iteration (ms): 76695.2
- iteration  583/  292968 | consumed samples: 1193984 | consumed tokens: 90865664 | elapsed time per iteration (ms): 76622.8
- iteration  584/  292968 | consumed samples: 1196032 | consumed tokens: 91062272 | elapsed time per iteration (ms): 77692.8
- iteration  585/  292968 | consumed samples: 1198080 | consumed tokens: 91258880 | elapsed time per iteration (ms): 76640.0
- iteration  586/  292968 | consumed samples: 1200128 | consumed tokens: 91455488 | elapsed time per iteration (ms): 75695.0
- iteration  587/  292968 | consumed samples: 1202176 | consumed tokens: 91652096 | elapsed time per iteration (ms): 76113.8
- iteration  588/  292968 | consumed samples: 1204224 | consumed tokens: 91848704 | elapsed time per iteration (ms): 77076.1
- iteration  589/  292968 | consumed samples: 1206272 | consumed tokens: 92045312 | elapsed time per iteration (ms): 76684.4
- iteration  590/  292968 | consumed samples: 1208320 | consumed tokens: 92241920 | elapsed time per iteration (ms): 75273.1
- iteration  591/  292968 | consumed samples: 1210368 | consumed tokens: 92438528 | elapsed time per iteration (ms): 75775.0
- iteration  592/  292968 | consumed samples: 1212416 | consumed tokens: 92635136 | elapsed time per iteration (ms): 76554.1
- iteration  593/  292968 | consumed samples: 1214464 | consumed tokens: 92831744 | elapsed time per iteration (ms): 75838.3
- iteration  594/  292968 | consumed samples: 1216512 | consumed tokens: 93028352 | elapsed time per iteration (ms): 75753.8
- iteration  595/  292968 | consumed samples: 1218560 | consumed tokens: 93224960 | elapsed time per iteration (ms): 75736.8
- iteration  596/  292968 | consumed samples: 1220608 | consumed tokens: 93421568 | elapsed time per iteration (ms): 77266.6
- iteration  597/  292968 | consumed samples: 1222656 | consumed tokens: 93618176 | elapsed time per iteration (ms): 77567.5
- iteration  598/  292968 | consumed samples: 1224704 | consumed tokens: 93814784 | elapsed time per iteration (ms): 77067.5
- iteration  599/  292968 | consumed samples: 1226752 | consumed tokens: 94011392 | elapsed time per iteration (ms): 75607.3
- iteration  600/  292968 | consumed samples: 1228800 | consumed tokens: 94208000 | elapsed time per iteration (ms): 74468.4
-saving checkpoint at iteration 600 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-23 10:00:27,727] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/mp_rank_01_model_states.pt
-[2021-10-23 10:00:27,965] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/mp_rank_00_model_states.pt
128 ZeRO optimizer-state shards were then written, each logged as "[2021-10-23 10:00:SS,mmm] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/zero_pp_rank_0_mp_rank_<NN>_optim_states.pt"; completion times by mp_rank, in log order:
- mp_rank_06 @ 10:00:42,396 | mp_rank_07 @ 10:00:42,437 | mp_rank_83 @ 10:00:42,469 | mp_rank_113 @ 10:00:42,505
- mp_rank_116 @ 10:00:42,523 | mp_rank_76 @ 10:00:42,550 | mp_rank_91 @ 10:00:42,596 | mp_rank_74 @ 10:00:42,629
- mp_rank_87 @ 10:00:42,691 | mp_rank_109 @ 10:00:42,692 | mp_rank_73 @ 10:00:42,705 | mp_rank_99 @ 10:00:42,708
- mp_rank_119 @ 10:00:42,711 | mp_rank_115 @ 10:00:42,723 | mp_rank_95 @ 10:00:42,738 | mp_rank_50 @ 10:00:42,748
- mp_rank_54 @ 10:00:42,764 | mp_rank_94 @ 10:00:42,767 | mp_rank_101 @ 10:00:42,852 | mp_rank_68 @ 10:00:42,853
- mp_rank_44 @ 10:00:42,862 | mp_rank_105 @ 10:00:42,882 | mp_rank_84 @ 10:00:42,973 | mp_rank_110 @ 10:00:42,990
- mp_rank_45 @ 10:00:43,021 | mp_rank_89 @ 10:00:43,022 | mp_rank_69 @ 10:00:43,038 | mp_rank_77 @ 10:00:43,040
- mp_rank_106 @ 10:00:43,076 | mp_rank_53 @ 10:00:43,111 | mp_rank_100 @ 10:00:43,181 | mp_rank_22 @ 10:00:43,196
- mp_rank_20 @ 10:00:43,217 | mp_rank_58 @ 10:00:43,218 | mp_rank_98 @ 10:00:43,264 | mp_rank_82 @ 10:00:43,444
- mp_rank_67 @ 10:00:43,449 | mp_rank_121 @ 10:00:43,469 | mp_rank_48 @ 10:00:43,501 | mp_rank_12 @ 10:00:43,523
- mp_rank_05 @ 10:00:43,538 | mp_rank_118 @ 10:00:43,583 | mp_rank_88 @ 10:00:43,584 | mp_rank_114 @ 10:00:43,595
- mp_rank_14 @ 10:00:43,597 | mp_rank_60 @ 10:00:43,598 | mp_rank_41 @ 10:00:43,599 | mp_rank_04 @ 10:00:43,616
- mp_rank_31 @ 10:00:43,618 | mp_rank_111 @ 10:00:43,618 | mp_rank_72 @ 10:00:43,636 | mp_rank_57 @ 10:00:43,661
- mp_rank_122 @ 10:00:43,663 | mp_rank_79 @ 10:00:43,689 | mp_rank_46 @ 10:00:43,718 | mp_rank_80 @ 10:00:43,719
- mp_rank_104 @ 10:00:43,720 | mp_rank_97 @ 10:00:43,733 | mp_rank_85 @ 10:00:43,735 | mp_rank_81 @ 10:00:43,739
- mp_rank_36 @ 10:00:43,752 | mp_rank_86 @ 10:00:43,808 | mp_rank_78 @ 10:00:43,808 | mp_rank_102 @ 10:00:43,810
- mp_rank_62 @ 10:00:43,816 | mp_rank_47 @ 10:00:43,849 | mp_rank_90 @ 10:00:43,863 | mp_rank_96 @ 10:00:43,884
- mp_rank_112 @ 10:00:43,896 | mp_rank_93 @ 10:00:43,897 | mp_rank_117 @ 10:00:43,907 | mp_rank_75 @ 10:00:43,908
- mp_rank_103 @ 10:00:43,913 | mp_rank_107 @ 10:00:43,933 | mp_rank_08 @ 10:00:43,954 | mp_rank_24 @ 10:00:43,957
- mp_rank_108 @ 10:00:43,960 | mp_rank_51 @ 10:00:43,991 | mp_rank_26 @ 10:00:43,993 | mp_rank_38 @ 10:00:44,004
- mp_rank_71 @ 10:00:44,013 | mp_rank_40 @ 10:00:44,016 | mp_rank_35 @ 10:00:44,021 | mp_rank_34 @ 10:00:44,022
- mp_rank_49 @ 10:00:44,028 | mp_rank_19 @ 10:00:44,045 | mp_rank_70 @ 10:00:44,069 | mp_rank_10 @ 10:00:44,101
- mp_rank_92 @ 10:00:44,119 | mp_rank_16 @ 10:00:44,126 | mp_rank_28 @ 10:00:44,138 | mp_rank_66 @ 10:00:44,221
- mp_rank_56 @ 10:00:44,262 | mp_rank_39 @ 10:00:44,300 | mp_rank_64 @ 10:00:44,321 | mp_rank_18 @ 10:00:44,383
- mp_rank_29 @ 10:00:44,388 | mp_rank_43 @ 10:00:44,448 | mp_rank_59 @ 10:00:44,472 | mp_rank_15 @ 10:00:44,487
- mp_rank_65 @ 10:00:44,497 | mp_rank_13 @ 10:00:44,586 | mp_rank_17 @ 10:00:44,680 | mp_rank_27 @ 10:00:44,722
- mp_rank_42 @ 10:00:44,761 | mp_rank_25 @ 10:00:44,776 | mp_rank_33 @ 10:00:44,792 | mp_rank_30 @ 10:00:44,868
- mp_rank_32 @ 10:00:45,002 | mp_rank_37 @ 10:00:45,030 | mp_rank_23 @ 10:00:45,104 | mp_rank_03 @ 10:00:45,252
- mp_rank_21 @ 10:00:45,622 | mp_rank_01 @ 10:00:45,705 | mp_rank_123 @ 10:00:45,803 | mp_rank_120 @ 10:00:46,508
- mp_rank_00 @ 10:00:46,686 | mp_rank_02 @ 10:00:46,855 | mp_rank_126 @ 10:00:47,302 | mp_rank_124 @ 10:00:50,109
- mp_rank_52 @ 10:00:50,723 | mp_rank_11 @ 10:00:51,585 | mp_rank_55 @ 10:00:51,847 | mp_rank_63 @ 10:00:51,940
- mp_rank_127 @ 10:00:52,492 | mp_rank_61 @ 10:00:53,079 | mp_rank_09 @ 10:00:53,159 | mp_rank_125 @ 10:00:57,324
- successfully saved checkpoint at iteration 600 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 32470.79
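The save at iteration 600 blocked training for the logged save-checkpoint time of 32470.79 ms, and that stall is folded into the next record rather than reported separately: iteration 601 below shows 105951.2 ms elapsed versus the roughly 76 s steady state. A small back-of-the-envelope check (the 76 s baseline is eyeballed from the surrounding records, not logged):

```python
# Rough accounting of the checkpoint stall; the baseline is an estimate.
save_checkpoint_ms = 32470.79  # logged "save-checkpoint" time
iter_601_ms = 105951.2         # logged elapsed time for iteration 601
baseline_ms = 76000.0          # eyeballed steady-state per-iteration time

spike_ms = iter_601_ms - baseline_ms
print(f"iteration-601 spike ~{spike_ms:.0f} ms vs save time {save_checkpoint_ms:.0f} ms")
# ~29951 ms vs ~32471 ms: same order of magnitude, so the save accounts for the spike.
```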
iterations 601-664, same constant fields as above (curriculum seqlen: 96); iteration 664's record is cut off mid-line at the excerpt boundary:
- iteration  601/  292968 | consumed samples: 1230848 | consumed tokens: 94404608 | elapsed time per iteration (ms): 105951.2
- iteration  602/  292968 | consumed samples: 1232896 | consumed tokens: 94601216 | elapsed time per iteration (ms): 76025.9
- iteration  603/  292968 | consumed samples: 1234944 | consumed tokens: 94797824 | elapsed time per iteration (ms): 76607.4
- iteration  604/  292968 | consumed samples: 1236992 | consumed tokens: 94994432 | elapsed time per iteration (ms): 76120.9
- iteration  605/  292968 | consumed samples: 1239040 | consumed tokens: 95191040 | elapsed time per iteration (ms): 76097.1
- iteration  606/  292968 | consumed samples: 1241088 | consumed tokens: 95387648 | elapsed time per iteration (ms): 78050.5
- iteration  607/  292968 | consumed samples: 1243136 | consumed tokens: 95584256 | elapsed time per iteration (ms): 78314.4
- iteration  608/  292968 | consumed samples: 1245184 | consumed tokens: 95780864 | elapsed time per iteration (ms): 76756.2
- iteration  609/  292968 | consumed samples: 1247232 | consumed tokens: 95977472 | elapsed time per iteration (ms): 76982.1
- iteration  610/  292968 | consumed samples: 1249280 | consumed tokens: 96174080 | elapsed time per iteration (ms): 75526.9
- iteration  611/  292968 | consumed samples: 1251328 | consumed tokens: 96370688 | elapsed time per iteration (ms): 75737.3
- iteration  612/  292968 | consumed samples: 1253376 | consumed tokens: 96567296 | elapsed time per iteration (ms): 74792.4
- iteration  613/  292968 | consumed samples: 1255424 | consumed tokens: 96763904 | elapsed time per iteration (ms): 75379.0
- iteration  614/  292968 | consumed samples: 1257472 | consumed tokens: 96960512 | elapsed time per iteration (ms): 75795.6
- iteration  615/  292968 | consumed samples: 1259520 | consumed tokens: 97157120 | elapsed time per iteration (ms): 75346.3
- iteration  616/  292968 | consumed samples: 1261568 | consumed tokens: 97353728 | elapsed time per iteration (ms): 74949.7
- iteration  617/  292968 | consumed samples: 1263616 | consumed tokens: 97550336 | elapsed time per iteration (ms): 75518.5
- iteration  618/  292968 | consumed samples: 1265664 | consumed tokens: 97746944 | elapsed time per iteration (ms): 76544.0
- iteration  619/  292968 | consumed samples: 1267712 | consumed tokens: 97943552 | elapsed time per iteration (ms): 74999.5
- iteration  620/  292968 | consumed samples: 1269760 | consumed tokens: 98140160 | elapsed time per iteration (ms): 75878.1
- iteration  621/  292968 | consumed samples: 1271808 | consumed tokens: 98336768 | elapsed time per iteration (ms): 75615.9
- iteration  622/  292968 | consumed samples: 1273856 | consumed tokens: 98533376 | elapsed time per iteration (ms): 75577.0
- iteration  623/  292968 | consumed samples: 1275904 | consumed tokens: 98729984 | elapsed time per iteration (ms): 75567.8
- iteration  624/  292968 | consumed samples: 1277952 | consumed tokens: 98926592 | elapsed time per iteration (ms): 76559.3
- iteration  625/  292968 | consumed samples: 1280000 | consumed tokens: 99123200 | elapsed time per iteration (ms): 76813.7
- iteration  626/  292968 | consumed samples: 1282048 | consumed tokens: 99319808 | elapsed time per iteration (ms): 77235.0
- iteration  627/  292968 | consumed samples: 1284096 | consumed tokens: 99516416 | elapsed time per iteration (ms): 76246.2
- iteration  628/  292968 | consumed samples: 1286144 | consumed tokens: 99713024 | elapsed time per iteration (ms): 75054.7
- iteration  629/  292968 | consumed samples: 1288192 | consumed tokens: 99909632 | elapsed time per iteration (ms): 76206.2
- iteration  630/  292968 | consumed samples: 1290240 | consumed tokens: 100106240 | elapsed time per iteration (ms): 75499.4
- iteration  631/  292968 | consumed samples: 1292288 | consumed tokens: 100302848 | elapsed time per iteration (ms): 75500.7
- iteration  632/  292968 | consumed samples: 1294336 | consumed tokens: 100499456 | elapsed time per iteration (ms): 76082.9
- iteration  633/  292968 | consumed samples: 1296384 | consumed tokens: 100696064 | elapsed time per iteration (ms): 76847.5
- iteration  634/  292968 | consumed samples: 1298432 | consumed tokens: 100892672 | elapsed time per iteration (ms): 78170.6
- iteration  635/  292968 | consumed samples: 1300480 | consumed tokens: 101089280 | elapsed time per iteration (ms): 75801.2
- iteration  636/  292968 | consumed samples: 1302528 | consumed tokens: 101285888 | elapsed time per iteration (ms): 76083.9
- iteration  637/  292968 | consumed samples: 1304576 | consumed tokens: 101482496 | elapsed time per iteration (ms): 75847.6
- iteration  638/  292968 | consumed samples: 1306624 | consumed tokens: 101679104 | elapsed time per iteration (ms): 76085.4
- iteration  639/  292968 | consumed samples: 1308672 | consumed tokens: 101875712 | elapsed time per iteration (ms): 77045.4
- iteration  640/  292968 | consumed samples: 1310720 | consumed tokens: 102072320 | elapsed time per iteration (ms): 76839.8
- iteration  641/  292968 | consumed samples: 1312768 | consumed tokens: 102268928 | elapsed time per iteration (ms): 75778.9
- iteration  642/  292968 | consumed samples: 1314816 | consumed tokens: 102465536 | elapsed time per iteration (ms): 75239.5
- iteration  643/  292968 | consumed samples: 1316864 | consumed tokens: 102662144 | elapsed time per iteration (ms): 76729.3
- iteration  644/  292968 | consumed samples: 1318912 | consumed tokens: 102858752 | elapsed time per iteration (ms): 75601.1
- iteration  645/  292968 | consumed samples: 1320960 | consumed tokens: 103055360 | elapsed time per iteration (ms): 75752.8
- iteration  646/  292968 | consumed samples: 1323008 | consumed tokens: 103251968 | elapsed time per iteration (ms): 75266.3
- iteration  647/  292968 | consumed samples: 1325056 | consumed tokens: 103448576 | elapsed time per iteration (ms): 76548.2
- iteration  648/  292968 | consumed samples: 1327104 | consumed tokens: 103645184 | elapsed time per iteration (ms): 76670.6
- iteration  649/  292968 | consumed samples: 1329152 | consumed tokens: 103841792 | elapsed time per iteration (ms): 76798.0
- iteration  650/  292968 | consumed samples: 1331200 | consumed tokens: 104038400 | elapsed time per iteration (ms): 76609.8
- iteration  651/  292968 | consumed samples: 1333248 | consumed tokens: 104235008 | elapsed time per iteration (ms): 75365.0
- iteration  652/  292968 | consumed samples: 1335296 | consumed tokens: 104431616 | elapsed time per iteration (ms): 75796.6
- iteration  653/  292968 | consumed samples: 1337344 | consumed tokens: 104628224 | elapsed time per iteration (ms): 75583.3
- iteration  654/  292968 | consumed samples: 1339392 | consumed tokens: 104824832 | elapsed time per iteration (ms): 77680.6
- iteration  655/  292968 | consumed samples: 1341440 | consumed tokens: 105021440 | elapsed time per iteration (ms): 75966.4
- iteration  656/  292968 | consumed samples: 1343488 | consumed tokens: 105218048 | elapsed time per iteration (ms): 76217.5
- iteration  657/  292968 | consumed samples: 1345536 | consumed tokens: 105414656 | elapsed time per iteration (ms): 75439.6
- iteration  658/  292968 | consumed samples: 1347584 | consumed tokens: 105611264 | elapsed time per iteration (ms): 75628.0
- iteration  659/  292968 | consumed samples: 1349632 | consumed tokens: 105807872 | elapsed time per iteration (ms): 75673.4
- iteration  660/  292968 | consumed samples: 1351680 | consumed tokens: 106004480 | elapsed time per iteration (ms): 77027.8
- iteration  661/  292968 | consumed samples: 1353728 | consumed tokens: 106201088 | elapsed time per iteration (ms): 79034.9
- iteration  662/  292968 | consumed samples: 1355776 | consumed tokens: 106397696 | elapsed time per iteration (ms): 77583.9
- iteration  663/  292968 | consumed samples: 1357824 | consumed tokens: 106594304 | elapsed time per iteration (ms): 75869.8
- iteration  664/  292968 | consumed samples: 1359872 | consumed tokens: 106790912 | elapsed time per iteration (ms): 75383.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465
| num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 665/ 292968 | consumed samples: 1361920 | consumed tokens: 106987520 | elapsed time per iteration (ms): 75894.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 666/ 292968 | consumed samples: 1363968 | consumed tokens: 107184128 | elapsed time per iteration (ms): 76825.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 667/ 292968 | consumed samples: 1366016 | consumed tokens: 107380736 | elapsed time per iteration (ms): 75277.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 668/ 292968 | consumed samples: 1368064 | consumed tokens: 107577344 | elapsed time per iteration (ms): 74962.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 669/ 292968 | consumed samples: 1370112 | consumed tokens: 107773952 | elapsed time per iteration (ms): 77627.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 670/ 292968 | consumed samples: 1372160 | consumed tokens: 107970560 | elapsed time per iteration (ms): 77889.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 671/ 292968 | consumed samples: 1374208 | consumed tokens: 108167168 | elapsed time per iteration (ms): 76639.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 672/ 292968 | consumed samples: 1376256 | consumed tokens: 108363776 | elapsed time per iteration (ms): 75677.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 673/ 292968 | consumed samples: 1378304 | consumed tokens: 108560384 | elapsed time per iteration (ms): 76553.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 674/ 292968 | consumed samples: 1380352 | consumed tokens: 108756992 | elapsed time per iteration (ms): 76026.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 675/ 292968 | consumed samples: 1382400 | consumed tokens: 
108953600 | elapsed time per iteration (ms): 75590.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 676/ 292968 | consumed samples: 1384448 | consumed tokens: 109150208 | elapsed time per iteration (ms): 75609.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 677/ 292968 | consumed samples: 1386496 | consumed tokens: 109346816 | elapsed time per iteration (ms): 75151.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 678/ 292968 | consumed samples: 1388544 | consumed tokens: 109543424 | elapsed time per iteration (ms): 75600.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 679/ 292968 | consumed samples: 1390592 | consumed tokens: 109740032 | elapsed time per iteration (ms): 76321.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 680/ 292968 | consumed samples: 1392640 | consumed tokens: 109936640 | elapsed time per iteration (ms): 76596.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 681/ 292968 | consumed samples: 1394688 | consumed tokens: 110133248 | elapsed time per iteration (ms): 74699.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 682/ 292968 | consumed samples: 1396736 | consumed tokens: 110329856 | elapsed time per iteration (ms): 76971.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 683/ 292968 | consumed samples: 1398784 | consumed tokens: 110526464 | elapsed time per iteration (ms): 75437.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 684/ 292968 | consumed samples: 1400832 | consumed tokens: 110723072 | elapsed time per iteration (ms): 77129.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 685/ 292968 | consumed samples: 1402880 | consumed tokens: 110919680 | elapsed time per iteration (ms): 76671.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | 
number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 686/ 292968 | consumed samples: 1404928 | consumed tokens: 111116288 | elapsed time per iteration (ms): 76006.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 687/ 292968 | consumed samples: 1406976 | consumed tokens: 111312896 | elapsed time per iteration (ms): 76657.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 688/ 292968 | consumed samples: 1409024 | consumed tokens: 111509504 | elapsed time per iteration (ms): 75831.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 689/ 292968 | consumed samples: 1411072 | consumed tokens: 111706112 | elapsed time per iteration (ms): 76089.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 690/ 292968 | consumed samples: 1413120 | consumed tokens: 111902720 | elapsed time per iteration (ms): 76356.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 691/ 292968 | consumed samples: 1415168 | consumed tokens: 112099328 | elapsed time per iteration (ms): 77592.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 692/ 292968 | consumed samples: 1417216 | consumed tokens: 112295936 | elapsed time per iteration (ms): 79668.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 693/ 292968 | consumed samples: 1419264 | consumed tokens: 112492544 | elapsed time per iteration (ms): 76034.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 694/ 292968 | consumed samples: 1421312 | consumed tokens: 112689152 | elapsed time per iteration (ms): 75553.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 695/ 292968 | consumed samples: 1423360 | consumed tokens: 112885760 | elapsed time per iteration (ms): 76585.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 696/ 292968 | consumed samples: 1425408 | consumed tokens: 113082368 | elapsed time per iteration (ms): 
77768.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 697/ 292968 | consumed samples: 1427456 | consumed tokens: 113278976 | elapsed time per iteration (ms): 78986.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 698/ 292968 | consumed samples: 1429504 | consumed tokens: 113475584 | elapsed time per iteration (ms): 75299.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 699/ 292968 | consumed samples: 1431552 | consumed tokens: 113672192 | elapsed time per iteration (ms): 76113.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 700/ 292968 | consumed samples: 1433600 | consumed tokens: 113868800 | elapsed time per iteration (ms): 75831.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 701/ 292968 | consumed samples: 1435648 | consumed tokens: 114065408 | elapsed time per iteration (ms): 77954.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 702/ 292968 | consumed samples: 1437696 | consumed tokens: 114262016 | elapsed time per iteration (ms): 76860.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 703/ 292968 | consumed samples: 1439744 | consumed tokens: 114458624 | elapsed time per iteration (ms): 77549.3 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 704/ 292968 | consumed samples: 1441792 | consumed tokens: 114655232 | elapsed time per iteration (ms): 76086.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 705/ 292968 | consumed samples: 1443840 | consumed tokens: 114851840 | elapsed time per iteration (ms): 75728.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 706/ 292968 | consumed samples: 1445888 | consumed tokens: 115048448 | elapsed time per iteration (ms): 77004.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 707/ 292968 | consumed samples: 1447936 | consumed tokens: 115245056 | elapsed time per iteration (ms): 75610.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 708/ 292968 | consumed samples: 1449984 | consumed tokens: 115441664 | elapsed time per iteration (ms): 76005.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 709/ 292968 | consumed samples: 1452032 | consumed tokens: 115638272 | elapsed time per iteration (ms): 74977.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 710/ 292968 | consumed samples: 1454080 | consumed tokens: 115834880 | elapsed time per iteration (ms): 77453.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 711/ 292968 | consumed samples: 1456128 | consumed tokens: 116031488 | elapsed time per iteration (ms): 74366.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 712/ 292968 | consumed samples: 1458176 | consumed tokens: 116228096 | elapsed time per iteration (ms): 74400.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 713/ 292968 | consumed samples: 1460224 | consumed tokens: 116424704 | elapsed time per iteration (ms): 75045.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 714/ 292968 | consumed samples: 1462272 | consumed tokens: 116621312 | elapsed time per iteration (ms): 75912.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 715/ 292968 | consumed samples: 1464320 | consumed tokens: 116817920 | elapsed time per iteration (ms): 75331.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 716/ 292968 | consumed samples: 1466368 | consumed tokens: 117014528 | elapsed time per iteration (ms): 74867.9 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 717/ 292968 | consumed samples: 1468416 | consumed tokens: 117211136 | elapsed time per iteration (ms): 76188.6 | learning rate: 6.000E-05 | global batch 
size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 718/ 292968 | consumed samples: 1470464 | consumed tokens: 117407744 | elapsed time per iteration (ms): 75181.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 719/ 292968 | consumed samples: 1472512 | consumed tokens: 117604352 | elapsed time per iteration (ms): 75603.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 720/ 292968 | consumed samples: 1474560 | consumed tokens: 117800960 | elapsed time per iteration (ms): 77618.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 721/ 292968 | consumed samples: 1476608 | consumed tokens: 117997568 | elapsed time per iteration (ms): 76350.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 722/ 292968 | consumed samples: 1478656 | consumed tokens: 118194176 | elapsed time per iteration (ms): 75529.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 723/ 292968 | consumed samples: 1480704 | consumed tokens: 118390784 | elapsed time per iteration (ms): 76634.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 724/ 292968 | consumed samples: 1482752 | consumed tokens: 118587392 | elapsed time per iteration (ms): 76610.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 725/ 292968 | consumed samples: 1484800 | consumed tokens: 118784000 | elapsed time per iteration (ms): 76137.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 726/ 292968 | consumed samples: 1486848 | consumed tokens: 118996992 | elapsed time per iteration (ms): 78329.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 727/ 292968 | consumed samples: 1488896 | consumed tokens: 119209984 | elapsed time per iteration (ms): 79337.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 728/ 
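The per-iteration token increment above follows a simple identity: consumed tokens advance by global batch size x curriculum seqlen each step, so the increment is 2048 x 96 = 196608 through iteration 725 and becomes 2048 x 104 = 212992 once the curriculum steps to seqlen 104 at iteration 726. Consumed samples likewise advance by the global batch size and in this log satisfy consumed samples = iteration x 2048 exactly (644 x 2048 = 1318912). A minimal sketch that checks this accounting against the logged counters; the entries list is copied from the records above, and all names are illustrative, not part of the training code:

    # Token/sample accounting implied by the log:
    #   consumed_tokens[i] - consumed_tokens[i-1] == global_batch_size * curriculum_seqlen[i]
    #   consumed_samples[i] == i * global_batch_size  (e.g. 644 * 2048 == 1318912)
    GLOBAL_BATCH_SIZE = 2048

    # (iteration, consumed tokens, curriculum seqlen) copied from the records above
    entries = [
        (724, 118587392, 96),
        (725, 118784000, 96),   # last iteration at seqlen 96
        (726, 118996992, 104),  # curriculum step: seqlen 96 -> 104
        (727, 119209984, 104),
    ]

    for (_, prev_tokens, _), (it, tokens, seqlen) in zip(entries, entries[1:]):
        delta = tokens - prev_tokens
        assert delta == GLOBAL_BATCH_SIZE * seqlen, (it, delta)
        print(f"iteration {it}: +{delta} tokens = {GLOBAL_BATCH_SIZE} x {seqlen}")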
-time (ms)
- iteration 728-843/ 292968 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
- consumed samples: 1490944 -> 1726464 (+2048 per iteration) | consumed tokens: 119422976 -> 143917056 (+212992 per iteration)
- elapsed time per iteration (ms), iterations 728-843 in order: 77771.8, 79374.8, 78461.2, 78942.7, 79955.3, 79427.6, 79713.2, 77863.6, 78405.3, 78191.6, 77427.7, 77339.1, 77282.0, 78543.4, 78583.7, 77734.4, 78005.4, 78154.7, 79098.1, 76901.4, 78364.6, 77745.9, 76993.3, 78065.6, 78716.9, 78297.3, 81533.9, 77260.0, 77380.6, 78639.9, 78547.1, 78637.1, 76681.2, 78835.7, 78476.9, 80815.2, 78990.4, 76814.6, 77218.7, 77724.4, 79202.3, 78713.7, 78768.2, 77027.7, 77694.6, 78285.6, 77768.3, 78751.7, 78528.9, 78682.7, 77272.9, 80038.3, 77708.2, 77785.1, 77721.4, 78420.5, 78087.0, 79958.2, 78833.2, 76965.1, 77924.4, 78840.8, 77402.5, 78261.1, 80176.0, 79974.7, 77972.7, 78413.7, 79004.0, 76848.4, 78243.3, 79156.5, 77568.2, 78323.3, 78633.3, 78813.2, 78171.9, 77535.3, 76979.2, 79204.0, 77025.7, 77032.0, 78530.2, 78796.7, 76478.2, 78875.2, 77038.7, 78966.6, 78271.8, 78760.9, 80164.0, 78758.7, 80404.6, 77913.2, 77540.7, 76602.2, 77871.2, 81554.1, 77593.7, 76966.6, 78500.0, 78281.2, 76785.8, 78291.6, 77150.0, 79163.8, 80157.2, 78440.4, 76862.3, 78281.3, 78619.5, 78310.7, 78428.3, 78459.9, 79007.5, 78188.0
- iteration 844/ 292968 | consumed samples: 1728512 | consumed tokens: 144130048 | elapsed time per iteration (ms): 79792.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 845/ 292968 | consumed samples: 1730560 | consumed tokens: 144343040 | elapsed time per iteration (ms): 79053.4 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 846/ 292968 | consumed samples: 1732608 | consumed tokens: 144556032 | elapsed time per iteration (ms): 77709.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 847/ 292968 | consumed samples: 1734656 | consumed tokens: 144769024 | elapsed time per iteration (ms): 77030.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 848/ 292968 | consumed samples: 1736704 | consumed tokens: 144982016 | elapsed time per iteration (ms): 78480.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 849/ 292968 | consumed samples: 1738752 | consumed tokens: 145195008 | elapsed time per iteration (ms): 79274.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 850/ 292968 | consumed samples: 1740800 | consumed tokens: 145408000 | elapsed time per iteration (ms): 78104.5 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 851/ 292968 | consumed samples: 1742848 | consumed tokens: 145620992 | elapsed time per iteration (ms): 78348.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 852/ 292968 | consumed samples: 1744896 | consumed tokens: 145833984 | elapsed time per iteration (ms): 78993.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 853/ 292968 | consumed samples: 1746944 | consumed tokens: 146046976 | elapsed time per iteration (ms): 78849.0 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 854/ 292968 | consumed samples: 1748992 | consumed tokens: 146259968 | elapsed time per iteration (ms): 78395.6 | learning rate: 6.000E-05 | global batch size: 2048 | loss 
scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 855/ 292968 | consumed samples: 1751040 | consumed tokens: 146472960 | elapsed time per iteration (ms): 77359.8 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 856/ 292968 | consumed samples: 1753088 | consumed tokens: 146685952 | elapsed time per iteration (ms): 79532.1 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 857/ 292968 | consumed samples: 1755136 | consumed tokens: 146898944 | elapsed time per iteration (ms): 77728.7 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 858/ 292968 | consumed samples: 1757184 | consumed tokens: 147111936 | elapsed time per iteration (ms): 77179.2 | learning rate: 6.000E-05 | global batch size: 2048 | loss scale: 1.0 | grad norm: 45230.465 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) -saving checkpoint at iteration 858 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-23 15:33:21,757] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/mp_rank_00_model_states.pt -[2021-10-23 15:33:21,796] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/mp_rank_01_model_states.pt -[2021-10-23 15:33:34,726] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_06_optim_states.pt -[2021-10-23 15:33:34,782] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-23 15:33:34,799] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-23 15:33:34,806] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-23 15:33:34,829] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_87_optim_states.pt -[2021-10-23 15:33:34,872] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-23 15:33:34,903] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_106_optim_states.pt -[2021-10-23 15:33:34,946] [INFO] 
[engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-23 15:33:34,981] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-23 15:33:35,033] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-23 15:33:35,036] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-23 15:33:35,049] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-23 15:33:35,073] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-23 15:33:35,105] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-23 15:33:35,142] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-23 15:33:35,146] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_111_optim_states.pt -[2021-10-23 15:33:35,168] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-23 15:33:35,197] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_118_optim_states.pt -[2021-10-23 15:33:35,247] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-23 15:33:35,249] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-23 15:33:35,292] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-23 15:33:35,317] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-23 15:33:35,338] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_72_optim_states.pt -[2021-10-23 15:33:35,375] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-23 15:33:35,376] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-23 15:33:35,438] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_97_optim_states.pt -[2021-10-23 15:33:35,459] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-23 15:33:35,497] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-23 15:33:35,532] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_21_optim_states.pt -[2021-10-23 15:33:35,691] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_23_optim_states.pt -[2021-10-23 15:33:35,827] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-23 15:33:35,840] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-23 15:33:35,848] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_82_optim_states.pt -[2021-10-23 15:33:35,859] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-23 15:33:35,867] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_79_optim_states.pt -[2021-10-23 15:33:35,909] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-23 15:33:35,933] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-23 15:33:35,947] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-23 15:33:35,949] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-23 15:33:35,950] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-23 15:33:35,967] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-23 15:33:35,984] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_84_optim_states.pt -[2021-10-23 15:33:35,993] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-23 15:33:35,997] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-23 15:33:36,008] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_05_optim_states.pt -[2021-10-23 15:33:36,015] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-23 15:33:36,016] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-23 15:33:36,045] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-23 15:33:36,066] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-23 15:33:36,066] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-23 15:33:36,074] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-23 15:33:36,088] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-23 15:33:36,096] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-23 15:33:36,103] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-23 15:33:36,104] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-23 15:33:36,106] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-23 15:33:36,112] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_07_optim_states.pt -[2021-10-23 15:33:36,138] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_115_optim_states.pt -[2021-10-23 15:33:36,138] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-23 15:33:36,157] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-23 15:33:36,160] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-23 15:33:36,167] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_107_optim_states.pt -[2021-10-23 15:33:36,175] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_08_optim_states.pt -[2021-10-23 15:33:36,205] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-23 15:33:36,234] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-23 15:33:36,235] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-23 15:33:36,246] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-23 15:33:36,287] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-23 15:33:36,288] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-23 15:33:36,289] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-23 15:33:36,292] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-23 15:33:36,303] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-23 15:33:36,311] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-23 15:33:36,312] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-23 15:33:36,325] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-23 15:33:36,331] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_10_optim_states.pt -[2021-10-23 15:33:36,338] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-23 15:33:36,345] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-23 15:33:36,379] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-23 15:33:36,411] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-23 15:33:36,411] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-23 15:33:36,416] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-23 15:33:36,424] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-23 15:33:36,466] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-23 15:33:36,501] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-23 15:33:36,522] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-23 15:33:36,539] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-23 15:33:36,548] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_27_optim_states.pt -[2021-10-23 15:33:36,549] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-23 15:33:36,559] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-23 15:33:36,572] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-23 15:33:36,576] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-23 15:33:36,642] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-23 15:33:36,686] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_14_optim_states.pt -[2021-10-23 15:33:36,700] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-23 15:33:36,726] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-23 15:33:36,823] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-23 15:33:36,840] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-23 15:33:36,850] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-23 15:33:36,942] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_28_optim_states.pt -[2021-10-23 15:33:37,006] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_22_optim_states.pt -[2021-10-23 15:33:37,121] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-23 15:33:37,131] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-23 15:33:37,157] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-23 15:33:37,172] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-23 15:33:37,186] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_24_optim_states.pt -[2021-10-23 15:33:37,187] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-23 15:33:37,191] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-23 15:33:37,312] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-23 15:33:37,348] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-23 15:33:37,349] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-23 15:33:37,735] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_00_optim_states.pt -[2021-10-23 15:33:38,006] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-23 15:33:38,182] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-23 15:33:38,416] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_20_optim_states.pt -[2021-10-23 15:33:38,831] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_120_optim_states.pt -[2021-10-23 15:33:38,958] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-23 15:33:38,963] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_126_optim_states.pt -[2021-10-23 15:33:39,107] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-23 15:33:39,245] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-23 15:33:43,374] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-23 15:33:43,966] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-23 15:33:44,271] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_11_optim_states.pt -[2021-10-23 15:33:44,517] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-23 15:33:45,450] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-23 15:33:45,558] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-23 15:33:45,629] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-23 15:33:45,748] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step858/zero_pp_rank_0_mp_rank_125_optim_states.pt - successfully saved checkpoint at iteration 858 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 26911.10 -[exiting program after 1190.9138893206914 minutes] datetime: 2021-10-23 15:33:45
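Each `- iteration N/ 292968 | ...` record above carries enough information to recover training throughput: with a global batch size of 2048 samples and a curriculum sequence length of 104 tokens, every iteration consumes 2048 * 104 = 212,992 tokens, which matches the per-iteration increase in "consumed tokens"; dividing by the elapsed time per iteration (~78,000 ms) gives roughly 2,700 tokens/s and ~26 samples/s, and the `save-checkpoint: 26911.10` entry shows the iteration-858 checkpoint cost about 27 s of wall time. A minimal parsing sketch; the regular expression, function name, and the `main_log.txt` filename are this editor's assumptions based only on the records shown here, not part of the original tooling:

import re

# Field layout copied from the iteration records above; records that were
# wrapped mid-field by the diff will simply be skipped by the regex.
RECORD = re.compile(
    r"iteration\s+(?P<it>\d+)/\s*\d+"
    r" \| consumed samples:\s+(?P<samples>\d+)"
    r" \| consumed tokens:\s+(?P<tokens>\d+)"
    r" \| elapsed time per iteration \(ms\):\s+(?P<ms>[\d.]+)"
)

def iter_throughput(log_text):
    """Yield (iteration, tokens/sec, samples/sec) from consecutive records."""
    prev = None
    for m in RECORD.finditer(log_text):
        cur = (int(m.group("it")), int(m.group("samples")),
               int(m.group("tokens")), float(m.group("ms")))
        if prev is not None and cur[0] == prev[0] + 1:
            secs = cur[3] / 1000.0
            yield cur[0], (cur[2] - prev[2]) / secs, (cur[1] - prev[1]) / secs
        prev = cur

if __name__ == "__main__":
    with open("main_log.txt") as f:  # hypothetical local copy of this log
        for it, tok_s, smp_s in iter_throughput(f.read()):
            print(f"iter {it}: {tok_s:,.0f} tok/s, {smp_s:.1f} samples/s")

For example, for iteration 814 the sketch yields (137740288 - 137527296) / 77.0387 s, about 2,765 tokens/s.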
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
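The banner above is emitted once per launched worker process (it repeated dozens of times in the raw log, collapsed here to a single instance): the distributed launcher defaults OMP_NUM_THREADS to 1 so that many ranks sharing a node do not each spawn a full set of OpenMP threads. If CPU-side work such as data loading or optimizer offload becomes a bottleneck, the variable can be raised before any thread pools initialise. A minimal sketch; the value 4 is chosen purely for illustration and is not a recommendation from this log:

import os

# Must be set before importing torch (or any OpenMP-backed library) so the
# value is picked up when the thread pools are created.
os.environ.setdefault("OMP_NUM_THREADS", "4")  # illustrative value only

import torch  # noqa: E402 -- deliberately imported after the env var is set

print("intra-op threads:", torch.get_num_threads())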
-***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [NO][NO] .............. [OKAY][OKAY] - -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lambfused_lamb .......................... [NO][NO] .............. [OKAY][OKAY] - -sparse_attnsparse_attn ............ninjaninja............ [NO] .................. ..................[NO]....... [OKAY] [OKAY].......[OKAY] - - - --------------------------------------------------[OKAY]-------------------------------------------------- - -transformerop name - transformer ............ op name ................ 
............[NO] ................ installed [NO] installed....... ........... [OKAY]compatiblecompatible -[OKAY]-------------------------------------------------- - - - ---------------------------------------------------stochastic_transformer - stochastic_transformer . .[NO] cpu_adam [NO] ...................... cpu_adam ....... [OKAY]............... [NO] -[NO] [OKAY] .............. - [OKAY] -[OKAY] -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lambfused_lamb .......................... [NO][NO] .............. [OKAY][OKAY] - -sparse_attnsparse_attn ........................ [NO][NO] .............. [OKAY][OKAY] - -transformer transformer............ ............[NO] [NO]....... .......[OKAY] -[OKAY] -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [NO][NO] ....... .......[OKAY] -[OKAY] -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lambfused_lamb .......................... [NO][NO] .............. [OKAY][OKAY] - -sparse_attn sparse_attn............ ............[NO] [NO]....... .......[OKAY] -[OKAY] -transformer ............transformer [NO]............ .......[NO] [OKAY]....... - [OKAY] -stochastic_transformer stochastic_transformer . [NO]. .......[NO] [OKAY]....... - [OKAY] -ninjaninja .................................... 
--------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible ---------------------------------------------------ninja - .................. [OKAY] --------------------------------------------------- -cpu_adam op name............... ................[NO] installed....... ..[OKAY] -compatible --------------------------------------------------- -fused_adamcpu_adam ............................ [NO][NO] .............. [OKAY][OKAY] - -fused_lamb ............. [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lambsparse_attn ......................... [NO][NO] .............. [OKAY][OKAY] - -transformer ............ [NO] .......ninja [OKAY] -.................. [OKAY]stochastic_transformersparse_attn - --------------------------------------------------............ -. op name[NO][NO] .............................. installed[OKAY] [OKAY] -.. - compatibletransformer - --------------------------------------------------............ - [NO] ....... [OKAY] -cpu_adam ............... stochastic_transformer[NO]ninja ....... ...................[OKAY] -[OKAY][NO] - ....... --------------------------------------------------[OKAY] - -op name ................ installed fused_adam.. .............compatible [NO] - --------------------------------------------------....... -[OKAY] -fused_lamb ............. cpu_adam[NO] ...................... [OKAY] -[NO] ....... [OKAY] -sparse_attnfused_adam ......................... [NO] [NO]....... .......[OKAY] - [OKAY] -transformer ............ fused_lamb[NO] .................... [OKAY] -[NO] ....... [OKAY]stochastic_transformer - . [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
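This extension op report is printed by each launcher process at startup, which is why the raw log carries one copy per rank. For reference, a minimal sketch of how the same per-op "compatible" column could be queried programmatically; the builder class names, the is_compatible() method, and the .name attribute are assumptions about the DeepSpeed 0.5.x op_builder API, not code taken from this run:

import deepspeed.ops.op_builder as op_builder

# Builders matching the op names in the report above (assumed names).
BUILDERS = [
    op_builder.CPUAdamBuilder,
    op_builder.FusedAdamBuilder,
    op_builder.FusedLambBuilder,
    op_builder.SparseAttnBuilder,
    op_builder.TransformerBuilder,
    op_builder.StochasticTransformerBuilder,
]

for cls in BUILDERS:
    builder = cls()
    # is_compatible() mirrors the [OKAY]/[NO] "compatible" column: it
    # checks whether the system could JIT-build this op with ninja.
    status = "[OKAY]" if builder.is_compatible() else "[NO]"
    print(f"{builder.name:<24}{status}")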
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
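The async_io [NO] status above comes from libaio being absent on the compute nodes. A standard-library check, independent of DeepSpeed's own probe, for whether the dynamic linker can see a libaio shared object (a sketch; the remediation steps are the ones the warning itself suggests):

import ctypes.util

# find_library returns a soname such as "libaio.so.1" when the dynamic
# linker can locate the library, and None otherwise -- None matches the
# situation reported by the warnings above.
soname = ctypes.util.find_library("aio")
if soname is None:
    print("libaio not found; install libaio-devel or point CFLAGS/LDFLAGS at it")
else:
    print(f"libaio available as {soname}")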
[NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -async_io ............... [NO] ....... [NO] -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -transformer_inference .. [NO] ....... [OKAY] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY]async_io - ...............utils [NO].................. [NO] .............. [NO][OKAY] - -quantizer .............. [NO] ....... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. --------------------------------------------------- -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -async_io ............... [NO] ....... 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
-> setting tensorboard ...
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** ------------------------------------------------------------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report --------------------------------------------------- --------------------------------------------------- - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - - -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninjaJIT compiled ops requires ninja-------------------------------------------------- - - - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. 
[OKAY] [OKAY][OKAY][OKAY] - - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -op name -op name op nameop name ................ ................ ................................ installed installed installedinstalled .. .. .... compatible -compatiblecompatiblecompatible --------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - -cpu_adam cpu_adam............... ...............cpu_adam[NO] cpu_adam .......[NO].............................. [OKAY][NO].......[NO] - .......[OKAY]....... - [OKAY][OKAY] - -fused_adam ............. [NO]fused_adam .................... fused_adam [OKAY] fused_adam[NO] -............. ....................[NO] [NO][OKAY]fused_lamb....... - ....................[OKAY] -[OKAY][NO]fused_lamb - .................... fused_lamb [OKAY] [NO] - .............fused_lamb....... [NO].............[OKAY] -.......[NO] [OKAY]....... - [OKAY] -sparse_attn ............ [NO] ....... [OKAY]sparse_attn - ............sparse_attn transformer [NO] ............ ............ .......sparse_attn [NO] [NO][OKAY] ............ -....... ....... [NO] [OKAY] transformer[OKAY] -....... - ............[OKAY] transformer -[NO] ...................stochastic_transformer transformer[NO][OKAY] -.................... [NO][OKAY][NO]stochastic_transformer - .............. .[OKAY][OKAY] - -[NO]stochastic_transformer ....... .stochastic_transformer[OKAY] -[NO] ........ [OKAY][NO] - ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
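The async_io op stays [NO] here because the libaio development files are not present on the compute nodes; the op table above shows everything this run needs as compatible, so the warning is benign. If the AIO op were required, the remedy is the one the warning itself suggests; a sketch, assuming a yum-based image and, for the source-built case, a hypothetical libaio prefix /opt/libaio (not a path from this log):

    # variant 1: install the development package (needs root)
    yum install libaio-devel

    # variant 2: point the DeepSpeed build at a source-built libaio;
    # DS_BUILD_AIO=1 asks DeepSpeed to prebuild the AIO op
    CFLAGS="-I/opt/libaio/include" LDFLAGS="-L/opt/libaio/lib" \
        DS_BUILD_AIO=1 pip install --no-cache-dir deepspeed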
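The "DeepSpeed general environment info" block above records the interpreter-side versions: torch 1.8.1 built for CUDA 11.1, NVCC 11.2 on the node, and DeepSpeed 0.5.5+29bee73 from the tr8b-104B checkout. A quick cross-check from the same conda environment; a minimal sketch:

    # expected per the log: 1.8.1 / 11.1 / 0.5.5+29bee73
    python -c 'import torch, deepspeed; print(torch.__version__, torch.version.cuda, deepspeed.__version__)'
    # NVCC is the toolkit used to JIT-build ops; 11.2 here
    nvcc --version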
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1663331.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 150
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 1190
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.006
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 0.0001
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 3750000
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
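The "setting number of micro-batches to constant 2048" line above follows directly from the batch arguments in the dump; a minimal sketch of the arithmetic (variable names ours, not Megatron's):

# Micro-batches per global batch =
#   global_batch_size / (micro_batch_size * data_parallel_size).
global_batch_size = 2048    # from the arguments dump
micro_batch_size = 1
data_parallel_size = 1

num_micro_batches = global_batch_size // (micro_batch_size * data_parallel_size)
assert num_micro_batches == 2048   # "setting number of micro-batches to constant 2048"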
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-24 11:35:28,287] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.298 seconds
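Two numbers reported above can be cross-checked against the arguments dump: the padded vocabulary (50257 -> 50688) and the model-parallel CUDA seed (2761). A sketch of both calculations, our own code; the fixed 2718 offset is what this Megatron codebase adds to the base seed when deriving per-tensor-parallel-rank seeds:

import math

# Vocab is padded up to a multiple of
# make_vocab_size_divisible_by * tensor_model_parallel_size.
orig_vocab, divisible_by, tp = 50257, 128, 4
multiple = divisible_by * tp                             # 512
padded = math.ceil(orig_vocab / multiple) * multiple
assert padded == 50688                                   # "new size: 50688"
assert padded - orig_vocab == 431                        # "431 dummy tokens"

# Model-parallel seed on global rank 0: base seed + fixed offset + tp rank.
seed, tp_rank = 43, 0
assert seed + 2718 + tp_rank == 2761                     # "model parallel seed: 2761"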
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
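The three modules above are JIT-built through PyTorch's C++/CUDA extension machinery; "ninja: no work to do." means the cached objects under megatron/fused_kernels/build were reused. A minimal sketch of the mechanism (file names illustrative, not the exact Megatron sources):

# torch.utils.cpp_extension.load emits a ninja build file, compiles the
# listed sources (here ninja finds everything cached), and imports the result.
from torch.utils.cpp_extension import load

scaled_masked_softmax = load(
    name="scaled_masked_softmax_cuda",
    sources=["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"],
    extra_cuda_cflags=["-O3"],
    verbose=True,  # prints the "Emitting ninja build file ..." lines seen above
)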
->>> done with compiling and loading fused kernels. Compilation time: 22.490 seconds
-time to initialize megatron (seconds): -8.884
-[after megatron is initialized] datetime: 2021-10-24 11:35:51
-building GPT model ...
-[2021-10-24 11:35:51,183] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-24 11:35:51,184] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-24 11:35:51,184] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.64 GB, percent = 21.7%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
-[2021-10-24 11:35:52,858] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2: <lambda>
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-     10: ParallelTransformerLayerPipe
-stage=4 layers=2
-     11: ParallelTransformerLayerPipe
-     12: ParallelTransformerLayerPipe
-stage=5 layers=2
-     13: ParallelTransformerLayerPipe
-     14: ParallelTransformerLayerPipe
-stage=6 layers=2
-     15: ParallelTransformerLayerPipe
-     16: ParallelTransformerLayerPipe
-stage=7 layers=2
-     17: ParallelTransformerLayerPipe
-     18: ParallelTransformerLayerPipe
-stage=8 layers=2
-     19: ParallelTransformerLayerPipe
-     20: ParallelTransformerLayerPipe
-stage=9 layers=2
-     21: ParallelTransformerLayerPipe
-     22: ParallelTransformerLayerPipe
-stage=10 layers=2
-     23: ParallelTransformerLayerPipe
-     24: ParallelTransformerLayerPipe
-stage=11 layers=2
-     25: ParallelTransformerLayerPipe
-     26: ParallelTransformerLayerPipe
-stage=12 layers=2
-     27: ParallelTransformerLayerPipe
-     28: ParallelTransformerLayerPipe
-stage=13 layers=2
-     29: ParallelTransformerLayerPipe
-     30: ParallelTransformerLayerPipe
-stage=14 layers=2
-     31: ParallelTransformerLayerPipe
-     32: ParallelTransformerLayerPipe
-stage=15 layers=2
-     33: ParallelTransformerLayerPipe
-     34: ParallelTransformerLayerPipe
-stage=16 layers=2
-     35: ParallelTransformerLayerPipe
-     36: ParallelTransformerLayerPipe
-stage=17 layers=2
-     37: ParallelTransformerLayerPipe
-     38: ParallelTransformerLayerPipe
-stage=18 layers=2
-     39: ParallelTransformerLayerPipe
-     40: ParallelTransformerLayerPipe
-stage=19 layers=2
-     41: ParallelTransformerLayerPipe
-     42: ParallelTransformerLayerPipe
-stage=20 layers=2
-     43: ParallelTransformerLayerPipe
-     44: ParallelTransformerLayerPipe
-stage=21 layers=2
-     45: ParallelTransformerLayerPipe
-     46: ParallelTransformerLayerPipe
-stage=22 layers=2
-     47: ParallelTransformerLayerPipe
-     48: ParallelTransformerLayerPipe
-stage=23 layers=2
-     49: ParallelTransformerLayerPipe
-     50: ParallelTransformerLayerPipe
-stage=24 layers=2
-     51: ParallelTransformerLayerPipe
-     52: ParallelTransformerLayerPipe
-stage=25 layers=2
-     53: ParallelTransformerLayerPipe
-     54: ParallelTransformerLayerPipe
-stage=26 layers=2
-     55: ParallelTransformerLayerPipe
-     56: ParallelTransformerLayerPipe
-stage=27 layers=2
-     57: ParallelTransformerLayerPipe
-     58: ParallelTransformerLayerPipe
-stage=28 layers=2
-     59: ParallelTransformerLayerPipe
-     60: ParallelTransformerLayerPipe
-stage=29 layers=2
-     61: ParallelTransformerLayerPipe
-     62: ParallelTransformerLayerPipe
-stage=30 layers=2
-     63: ParallelTransformerLayerPipe
-     64: ParallelTransformerLayerPipe
-stage=31 layers=6
-     65: ParallelTransformerLayerPipe
-     66: ParallelTransformerLayerPipe
-     67: <lambda>
-     68: MixedFusedLayerNorm
-     69: EmbeddingPipe
-     70: float16_to_fp32
-  loss: CrossEntropy
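Both the topology dict and the stage partitioning above are regular: with data-parallel size 1 and tensor-parallel size 4, global ranks enumerate the tensor dimension fastest, and the 64 transformer layers spread evenly over 32 pipeline stages. A small reconstruction (our own code, not Megatron's):

TP, PP, NUM_LAYERS = 4, 32, 64

# rank = pipe * TP + model, matching e.g. ProcessCoord(pipe=14, data=0, model=2): 58
for rank in (0, 5, 58, 127):
    pipe, model = divmod(rank, TP)
    print(f"ProcessCoord(pipe={pipe}, data=0, model={model}): {rank}")

# Two transformer layers per stage; the first and last stages additionally
# carry the cast/embedding/final-LayerNorm layers (hence layers=5 and layers=6).
assert NUM_LAYERS // PP == 2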
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800 - > number of parameters on (tensor, 
pipeline) model parallel rank (2, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 
807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 - - - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
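A quick consistency check on these counts, as a sketch: assuming hidden size 11600 and 4-way tensor parallelism (both inferred from the numbers themselves; the log does not state them), the per-rank total for a middle stage of two transformer layers comes out to exactly 807539800, and the split it implies reappears below as the two ZeRO partition group sizes.

    # Sketch: reproduce the 807539800 parameters/rank reported for middle
    # pipeline stages. h and tp are assumptions inferred from the numbers.
    h = 11600            # assumed hidden size
    tp = 4               # tensor-parallel degree (tensor ranks 0-3 above)
    layers = 2           # ParallelTransformerLayerPipe modules per stage

    # Tensor-parallel weight matrices per layer: QKV 3h^2, attention output
    # projection h^2, MLP up 4h^2, MLP down 4h^2 -> 12h^2 split over tp ranks.
    weights = 12 * h * h // tp

    # Biases and LayerNorms per rank per layer: QKV bias 3h/tp, attn output
    # bias h, MLP biases 4h/tp and h, two LayerNorms of 2h parameters each.
    small = 3 * h // tp + h + 4 * h // tp + h + 2 * (2 * h)

    assert layers * weights == 807_360_000   # first ZeRO partition group below
    assert layers * small == 179_800         # second ZeRO partition group below
    print(layers * (weights + small))        # 807539800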
-[2021-10-24 11:35:53,545] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-24 11:35:53,546] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-24 11:35:53,546] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.81 GB, percent = 21.8%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-24 11:35:53,547] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-[2021-10-24 11:35:53,584] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-24 11:35:53,584] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-24 11:35:53,584] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-24 11:35:53,584] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-24 11:35:53,584] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-24 11:35:53,584] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-24 11:35:53,584] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-24 11:35:53,584] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-24 11:35:53,584] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-24 11:35:53,584] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
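The optimizer lines above pin down the ZeRO settings in play. A minimal config sketch that would produce them, assuming DeepSpeed 0.5.x key names (the run's actual JSON config is not part of this log):

    # Assumed sketch of the ZeRO section implied by the INFO lines above:
    # fp16 ZeRO stage 1, 5e8 reduce/allgather buckets, no CPU offload.
    ds_config = {
        "fp16": {"enabled": True},
        "zero_optimization": {
            "stage": 1,
            "reduce_bucket_size": 500_000_000,
            "allgather_bucket_size": 500_000_000,
        },
    }
    # "Using client Optimizer as basic optimizer" means the training script
    # constructs FusedAdam itself and hands it to deepspeed.initialize()
    # (the config kwarg is named config_params in some 0.5.x releases).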
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils...
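The compiler warning above is emitted by torch.utils.cpp_extension when the compiler it is about to invoke (named by $CXX, default "c++") does not look like the g++ that PyTorch was built with; on this cluster the mismatch is benign and the build proceeds. A sketch of the usual workaround, assuming g++ is on PATH:

    import os

    # cpp_extension invokes the compiler named by $CXX (default "c++") and
    # warns when it does not match the one PyTorch was built with (g++ on
    # linux). Pointing CXX at g++ before the first extension load typically
    # silences the warning; here it was left as-is and the build still ran.
    os.environ["CXX"] = "g++"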
-Time to load utils op: between 0.77 and 0.91 seconds on each rank's first load
-Rank: 0-3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 4-123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 124-127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
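Why every rank reports partition count [1, 1]: ZeRO shards optimizer state across the data-parallel group only, and the topology visible in this log leaves that group with a single member. A small sketch of the arithmetic, with the degrees read off the rank numbering above:

    # ZeRO-1 partitions optimizer state across data-parallel peers only.
    world_size = 128   # ranks 0-127 appear in the partition lines above
    tp = 4             # tensor ranks 0-3
    pp = 32            # pipeline ranks 0-31
    dp = world_size // (tp * pp)
    assert dp == 1     # one peer per group -> "partition count [1, 1]"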
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... -Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.002484560012817383 seconds
-[2021-10-24 11:35:56,303] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-24 11:35:56,304] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-24 11:35:56,304] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.83 GB, percent = 21.8%
-[2021-10-24 11:35:56,355] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-24 11:35:56,356] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-24 11:35:56,356] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.83 GB, percent = 21.8%
-[2021-10-24 11:35:56,356] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-24 11:35:56,388] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-24 11:35:56,388] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-24 11:35:56,388] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.83 GB, percent = 21.8%
-[2021-10-24 11:35:56,388] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-24 11:35:56,389] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-24 11:35:56,389] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-24 11:35:56,389] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-24 11:35:56,389] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] activation_checkpointing_config {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-24 11:35:56,389] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-24 11:35:56,390] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-24 11:35:56,391] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-24 11:35:56,391] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
-[2021-10-24 11:35:56,392] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-24 11:35:56,784] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,784] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,784] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,784] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
(104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,785] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) -[2021-10-24 11:35:56,873] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-10-24 11:35:56,873] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
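For reference, the figures above are mutually consistent; a quick sanity check in Python, under the assumption (not visible in this excerpt) that the stages not shown, such as stage 0, report the same per-rank STAGE_PARAMS as the other 2-layer stages:

    # Sanity check of the engine.py:151 figures above. Assumption: stages not
    # shown here (e.g. stage 0) match the 807,539,800-parameter 2-layer stages.
    ranks_per_stage = 128 // 32          # 4 ranks report each pipeline stage
    two_layer_stage = 807_539_800        # per-rank STAGE_PARAMS, stages 0-30
    last_stage      = 978_315_000        # per-rank STAGE_PARAMS, stage 31 (6 layers)

    unique = ranks_per_stage * (31 * two_layer_stage + last_stage)
    assert unique == 104_048_195_200     # matches UNIQUE_PARAMS exactly

    # TOTAL_PARAMS minus UNIQUE_PARAMS is the volume of replicated parameters
    # (the duplicated embedding copies flagged by the UserWarning further below):
    print(104_731_203_200 - 104_048_195_200)   # -> 683008000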
-[2021-10-24 11:35:56,873] [WARNING] [engine.py:2025:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-[... the same engine.py:2025 load_checkpoint warning repeated once per rank, timestamps 11:35:56,873 through 11:35:56,880 ...]
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- will not load any checkpoints and will start from random
-time (ms) | load-checkpoint: 9.74
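The warning above refers to DeepSpeed's `latest`-file convention: `save_checkpoint` writes a one-line `latest` file containing the most recent tag into the checkpoint directory, and `load_checkpoint` with no tag reads it back. A minimal sketch of the resume logic, assuming an already-initialized DeepSpeed engine named `engine` (the tag value is a made-up example):

    import os

    load_dir = "/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints"

    if os.path.isfile(os.path.join(load_dir, "latest")):
        # With tag=None, DeepSpeed resolves the tag from the `latest` file.
        load_path, client_state = engine.load_checkpoint(load_dir)
    else:
        # Either pass an explicit tag, e.g.:
        #   engine.load_checkpoint(load_dir, tag="global_step100")
        # or, as this run does, proceed with randomly initialized weights.
        load_path, client_state = None, None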
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-[... the same UserWarning repeated once per rank, interleaved with the per-rank estimates; most ranks report "estimated model parameters: 103.3650944", while ranks holding an embedding copy report "estimated model parameters: 125.22432" ...]
inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with 
the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate 
with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - 
warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter 
count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be 
inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold 
several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.22432 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated 
model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold 
several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.368064 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP 
> 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage 
hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.368064 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters 
without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -estimated model parameters: 125.2213504estimated model parameters: 125.2213504estimated model parameters: 125.2213504 - - -estimated model parameters: 103.3650944 - -estimated model parameters: 125.2213504 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without 
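The UserWarning above explains the spread in the totals it accompanies: with pipeline parallelism (PP > 1) the first and last stages each hold a copy of the tied input/output embeddings, so counts taken "with embeddings" disagree between ranks (103.3650944 vs. 125.2213504 / 125.22432 billion here), while the "without embeddings" counts agree up to small per-stage differences. Below is a minimal sketch of the two counting modes, with a hypothetical helper name rather than the actual megatron/utils.py implementation:

    import torch.nn as nn

    def params_in_billions(module: nn.Module, exclude_embeddings: bool = False) -> float:
        # Hypothetical helper (not the real megatron/utils.py code): sum
        # parameter element counts over the module tree, optionally skipping
        # nn.Embedding weights -- the "without embeddings" variant above.
        total = 0
        for sub in module.modules():
            if exclude_embeddings and isinstance(sub, nn.Embedding):
                continue
            total += sum(p.numel() for p in sub.parameters(recurse=False))
        return total / 1e9

Ranks whose pipeline stage holds an embedding copy presumably land on the larger "with embeddings" figure, which is exactly the inaccuracy the warning points at.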
embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-24 11:35:56 -> building train, validation, and test datasets ... - > datasets target sizes (minimum size): - train: 600000000 - validation: 20008960 - test: 10240 -> building train, validation, and test datasets for GPT ... - > building dataset index ... - reading sizes... - reading pointers... - reading document index... - creating numpy buffer of mmap... - creating memory view of numpy buffer... - > finished creating indexed dataset in 0.348006 seconds - number of documents: 304230423 - > dataset split: - train: - document indices in [0, 288714672) total of 288714672 documents - validation: - document indices in [288714672, 303926193) total of 15211521 documents - test: - document indices in [303926193, 304230423) total of 304230 documents - > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy - > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy - > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy - loaded indexed file in 0.346 seconds - total number of samples: 657686117 - total number of epochs: 5 - > WARNING: could not find index map files, building the indices on rank 0 ... 
- > last epoch number of samples (6154639) is larger than 80% of number of samples per epoch (6927160), setting separate_last_epoch to False
- > elapsed time to build and save doc-idx mapping (seconds): 4.577712
-    using:
-     number of documents:     15211521
-     number of epochs:        3
-     sequence length:         2048
-     total number of samples: 20781482
- > elapsed time to build and save sample-idx mapping (seconds): 1.051683
- > building shuffle index with split [0, 20781482) and [20781482, 20781482) ...
- > elapsed time to build and save shuffle-idx mapping (seconds): 1.168543
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.120 seconds
-    total number of samples: 20781483
-    total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.079 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-24 11:36:12
-done with setup ...
-training ...
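The separate_last_epoch decision above is a simple threshold: the final (partial) epoch is only shuffled separately when it holds fewer than 80% of the samples of a full epoch. Plugging in the values from this build: 0.80 × 6927160 = 5541728, and 6154639 > 5541728, hence False. A minimal sketch of that rule (helper name hypothetical, mirroring the log message rather than the exact Megatron source):

    def use_separate_last_epoch(last_epoch_samples: int, samples_per_epoch: int) -> bool:
        # Shuffle the final partial epoch separately only when it covers
        # less than 80% of a full epoch, per the log line above.
        return last_epoch_samples < int(0.80 * samples_per_epoch)

    # Values from the index build above:
    assert use_separate_last_epoch(6154639, 6927160) is False

The index-map filenames also encode the build inputs: in meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy, 20008960ns is the target number of samples, 2048sl the sequence length, and 43s apparently the shuffle seed.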
-time (ms) | model-and-optimizer-setup: 5763.97 | train/valid/test-data-iterators-setup: 13586.42
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-[before the start of training step] datetime: 2021-10-24 11:36:12
-[2021-10-24 11:36:12,609] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-24 11:36:12,610] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-24 11:36:12,610] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-24 11:36:12,610] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-24 11:36:12,610] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
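The spread of parameter totals above is consistent with the pipeline stages that hold the word-embedding matrix reporting roughly 21.86 billion extra parameters (125.2213504e9 - 103.3650944e9). A minimal back-of-the-envelope sketch, assuming a standard GPT-style layout; the run's actual hidden size, layer count, and vocabulary are not shown in this excerpt, and gpt_param_count is a hypothetical helper, not a Megatron-DeepSpeed API:

    def gpt_param_count(num_layers: int, hidden: int, vocab: int) -> dict:
        # Rough GPT-style accounting: 4h^2 (attention) + 8h^2 (MLP) per layer,
        # biases and layer norms ignored. The embedding block lives only on
        # the ranks that report the larger totals above.
        per_layer = 12 * hidden ** 2
        body = num_layers * per_layer
        embeddings = vocab * hidden
        return {"with_embeddings": body + embeddings, "without_embeddings": body}

    # Gap between the two totals reported in this log:
    gap = 125.2213504e9 - 103.3650944e9   # ~2.1856e10, plausibly vocab * hidden here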
- iteration 1/ 292968 | consumed samples: 2048 | consumed tokens: 131072 | elapsed time per iteration (ms): 215777.7 | learning rate: 5.461E-08 | global batch size: 2048 | lm loss: 1.104119E+01 | loss scale: 4096.0 | grad norm: 261416.473 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-[Ranks 0-3] (after 1 iterations) memory (MB) | allocated: 13201.3-13202.8 | max allocated: 20664.8-20666.4 | reserved: 24442.0 | max reserved: 24442.0
-[Ranks 4-108] (after 1 iterations) memory (MB) | allocated: 10787.1 | max allocated: 16947.3-16947.7 | reserved: 20074.0-20086.0 | max reserved: 20074.0-20086.0
-[Ranks 109-123] (after 1 iterations) memory (MB) | allocated: 10787.1 | max allocated: 16947.4-16947.8 | reserved: 16994.0 | max reserved: 16994.0
-[Ranks 124-127] (after 1 iterations) memory (MB) | allocated: 13082.7-13082.9 | max allocated: 20546.3-20546.5 | reserved: 24406.0 | max reserved: 24406.0
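These per-rank figures match PyTorch's CUDA memory counters (allocated tensors vs. the caching allocator's reserved pool). A minimal sketch of how such a report line can be produced; this is an illustration, not the exact helper used by the training code:

    import torch
    import torch.distributed as dist

    def report_memory_mb() -> str:
        # Formats one "[Rank N] ... memory (MB)" line like those above.
        mb = 1024 * 1024
        rank = dist.get_rank() if dist.is_initialized() else 0
        return (
            f"[Rank {rank}] memory (MB)"
            f" | allocated: {torch.cuda.memory_allocated() / mb}"
            f" | max allocated: {torch.cuda.max_memory_allocated() / mb}"
            f" | reserved: {torch.cuda.memory_reserved() / mb}"
            f" | max reserved: {torch.cuda.max_memory_reserved() / mb}"
        )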
- iteration 2/ 292968 | consumed samples: 4096 | consumed tokens: 262144 | elapsed time per iteration (ms): 150741.8 | learning rate: 1.092E-07 | global batch size: 2048 | lm loss: 1.104001E+01 | loss scale: 4096.0 | grad norm: 262433.480 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3/ 292968 | consumed samples: 6144 | consumed tokens: 393216 | elapsed time per iteration (ms): 140059.4 | learning rate: 1.638E-07 | global batch size: 2048 | lm loss: 1.089435E+01 | loss scale: 4096.0 | grad norm: 260068.314 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 4/ 292968 | consumed samples: 8192 | consumed tokens: 524288 | elapsed time per iteration (ms): 138400.5 | learning rate: 2.185E-07 | global batch size: 2048 | lm loss: 9.762675E+00 | loss scale: 4096.0 | grad norm: 150083.674 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
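Each step in this stretch consumes global batch size x curriculum seqlen tokens: 2048 x 64 = 131072, exactly the per-iteration increment of the "consumed tokens" counter (this changes once curriculum learning raises the sequence length). A quick sanity-check sketch; the constant names are illustrative:

    GLOBAL_BATCH_SIZE = 2048   # samples per iteration, from the log
    CURRICULUM_SEQLEN = 64     # current curriculum sequence length, from the log

    TOKENS_PER_ITER = GLOBAL_BATCH_SIZE * CURRICULUM_SEQLEN   # 131072

    def consumed_after(iteration: int) -> tuple:
        # Cumulative (samples, tokens) while the curriculum seqlen stays at 64.
        return iteration * GLOBAL_BATCH_SIZE, iteration * TOKENS_PER_ITER

    assert consumed_after(4) == (8192, 524288)   # matches the iteration 4 line above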
- iteration 5/ 292968 | consumed samples: 10240 | consumed tokens: 655360 | elapsed time per iteration (ms): 139220.6 | learning rate: 2.731E-07 | global batch size: 2048 | lm loss: 1.130721E+01 | loss scale: 4096.0 | grad norm: 1528994.109 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 6/ 292968 | consumed samples: 12288 | consumed tokens: 786432 | elapsed time per iteration (ms): 100839.1 | learning rate: 3.277E-07 | global batch size: 2048 | lm loss: 1.116081E+01 | loss scale: 4096.0 | grad norm: 858542.292 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 7/ 292968 | consumed samples: 14336 | consumed tokens: 917504 | elapsed time per iteration (ms): 108711.0 | learning rate: 3.823E-07 | global batch size: 2048 | lm loss: 1.067137E+01 | loss scale: 4096.0 | grad norm: 903248.291 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 8/ 292968 | consumed samples: 16384 | consumed tokens: 1048576 | elapsed time per iteration (ms): 157271.1 | learning rate: 4.369E-07 | global batch size: 2048 | lm loss: 9.884519E+00 | loss scale: 4096.0 | grad norm: 587153.416 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 9/ 292968 | consumed samples: 18432 | consumed tokens: 1179648 | elapsed time per iteration (ms): 153391.1 | learning rate: 4.915E-07 | global batch size: 2048 | lm loss: 9.576445E+00 | loss scale: 4096.0 | grad norm: 166008.554 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 10/ 292968 | consumed samples: 20480 | consumed tokens: 1310720 | elapsed time per iteration (ms): 142148.9 | learning rate: 5.461E-07 | global batch size: 2048 | lm loss: 9.377088E+00 | loss scale: 4096.0 | grad norm: 97118.035 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 11/ 292968 | consumed samples: 22528 | consumed tokens: 1441792 | elapsed time per iteration (ms): 159856.5 | learning rate: 6.007E-07 | global batch size: 2048 | lm loss: 9.444679E+00 | loss scale: 4096.0 | grad norm: 439206.545 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 12/ 292968 | consumed samples: 24576 | consumed tokens: 1572864 | elapsed time per iteration (ms): 125421.3 | learning rate: 6.554E-07 | global batch size: 2048 | lm loss: 1.034726E+01 | loss scale: 4096.0 | grad norm: 868844.544 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 13/ 292968 | consumed samples: 26624 | consumed tokens: 1703936 | elapsed time per iteration (ms): 126101.7 | learning rate: 7.100E-07 | global batch size: 2048 | lm loss: 9.303679E+00 | loss scale: 4096.0 | grad norm: 191347.120 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 14/ 292968 | consumed samples: 28672 | consumed tokens: 1835008 | elapsed time per iteration (ms): 124492.4 | learning rate: 7.646E-07 | global batch size: 2048 | lm loss: 9.127639E+00 | loss scale: 4096.0 | grad norm: 78849.008 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
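The learning rate rises by about 5.461E-08 per iteration here (5.461E-08 at iteration 1, 5.461E-07 at iteration 10, 7.646E-07 at iteration 14), i.e. a linear warmup; the warmup endpoint and peak rate fall outside this excerpt. A small sketch of the visible region, with LR_STEP read off the log rather than taken from any config:

    LR_STEP = 5.461e-08   # observed per-iteration increment during warmup

    def warmup_lr(iteration: int) -> float:
        # Linear warmup; only valid for the region visible in this log.
        return iteration * LR_STEP

    for it, logged in [(1, 5.461e-08), (10, 5.461e-07), (14, 7.646e-07)]:
        assert abs(warmup_lr(it) - logged) <= 1e-3 * logged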
- iteration 15/ 292968 | consumed samples: 30720 | consumed tokens: 1966080 | elapsed time per iteration (ms): 124999.6 | learning rate: 8.192E-07 | global batch size: 2048 | lm loss: 9.099547E+00 | loss scale: 4096.0 | grad norm: 82243.146 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 16/ 292968 | consumed samples: 32768 | consumed tokens: 2097152 | elapsed time per iteration (ms): 117227.7 | learning rate: 8.738E-07 | global batch size: 2048 | lm loss: 8.988091E+00 | loss scale: 4096.0 | grad norm: 75136.508 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 17/ 292968 | consumed samples: 34816 | consumed tokens: 2228224 | elapsed time per iteration (ms): 118910.7 | learning rate: 9.284E-07 | global batch size: 2048 | lm loss: 8.833913E+00 | loss scale: 4096.0 | grad norm: 47455.586 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 18/ 292968 | consumed samples: 36864 | consumed tokens: 2359296 | elapsed time per iteration (ms): 111138.1 | learning rate: 9.830E-07 | global batch size: 2048 | lm loss: 8.794515E+00 | loss scale: 4096.0 | grad norm: 116474.981 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 19/ 292968 | consumed samples: 38912 | consumed tokens: 2490368 | elapsed time per iteration (ms): 118823.4 | learning rate: 1.038E-06 | global batch size: 2048 | lm loss: 8.704759E+00 | loss scale: 4096.0 | grad norm: 71486.803 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 20/ 292968 | consumed samples: 40960 | consumed tokens: 2621440 | elapsed time per iteration (ms): 115637.3 | learning rate: 1.092E-06 | global batch size: 2048 | lm loss: 8.667233E+00 | loss scale: 4096.0 | grad norm: 71556.371 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 21/ 292968 | consumed samples: 43008 | consumed tokens: 2752512 | elapsed time per iteration (ms): 126253.4 | learning rate: 1.147E-06 | global batch size: 2048 | lm loss: 8.571645E+00 | loss scale: 4096.0 | grad norm: 43307.146 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 22/ 292968 | consumed samples: 45056 | consumed tokens: 2883584 | elapsed time per iteration (ms): 114040.9 | learning rate: 1.201E-06 | global batch size: 2048 | lm loss: 8.597071E+00 | loss scale: 4096.0 | grad norm: 56901.877 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 23/ 292968 | consumed samples: 47104 | consumed tokens: 3014656 | elapsed time per iteration (ms): 130940.9 | learning rate: 1.256E-06 | global batch size: 2048 | lm loss: 8.552147E+00 | loss scale: 4096.0 | grad norm: 27945.872 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 24/ 292968 | consumed samples: 49152 | consumed tokens: 3145728 | elapsed time per iteration (ms): 126515.8 | learning rate: 1.311E-06 | global batch size: 2048 | lm loss: 8.514710E+00 | loss scale: 4096.0 | grad norm: 27435.939 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
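The constant "loss scale: 4096.0" together with zero skipped and zero NaN iterations indicates the fp16 dynamic loss scaler has not yet seen an overflow in this excerpt. A minimal sketch of the usual dynamic-scaling policy; illustrative only, not DeepSpeed's actual scaler class or its default constants:

    class DynamicLossScaler:
        # Halve the scale and skip the step on overflow; grow it back after a
        # long enough run of clean steps. Constants are illustrative.
        def __init__(self, scale: float = 4096.0, growth_interval: int = 1000):
            self.scale = scale
            self.growth_interval = growth_interval
            self._clean_steps = 0

        def update(self, found_overflow: bool) -> bool:
            """Return True if the optimizer step should be applied."""
            if found_overflow:
                self.scale /= 2          # would show up as a "skipped iteration"
                self._clean_steps = 0
                return False
            self._clean_steps += 1
            if self._clean_steps % self.growth_interval == 0:
                self.scale *= 2
            return True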
- iteration 25/ 292968 | consumed samples: 51200 | consumed tokens: 3276800 | elapsed time per iteration (ms): 114228.0 | learning rate: 1.365E-06 | global batch size: 2048 | lm loss: 8.525074E+00 | loss scale: 4096.0 | grad norm: 87266.386 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 26/ 292968 | consumed samples: 53248 | consumed tokens: 3407872 | elapsed time per iteration (ms): 121080.8 | learning rate: 1.420E-06 | global batch size: 2048 | lm loss: 8.503829E+00 | loss scale: 4096.0 | grad norm: 53806.253 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 27/ 292968 | consumed samples: 55296 | consumed tokens: 3538944 | elapsed time per iteration (ms): 109511.9 | learning rate: 1.475E-06 | global batch size: 2048 | lm loss: 8.426759E+00 | loss scale: 4096.0 | grad norm: 45280.155 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 28/ 292968 | consumed samples: 57344 | consumed tokens: 3670016 | elapsed time per iteration (ms): 125610.3 | learning rate: 1.529E-06 | global batch size: 2048 | lm loss: 8.442092E+00 | loss scale: 4096.0 | grad norm: 33438.298 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 29/ 292968 | consumed samples: 59392 | consumed tokens: 3801088 | elapsed time per iteration (ms): 113773.2 | learning rate: 1.584E-06 | global batch size: 2048 | lm loss: 8.389614E+00 | loss scale: 4096.0 | grad norm: 29346.871 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 30/ 292968 | consumed samples: 61440 | consumed tokens: 3932160 | elapsed time per iteration (ms): 115546.8 | learning rate: 1.638E-06 | global batch size: 2048 | lm loss: 8.368752E+00 | loss scale: 4096.0 | grad norm: 37240.694 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 31/ 292968 | consumed samples: 63488 | consumed tokens: 4063232 | elapsed time per iteration (ms): 114919.3 | learning rate: 1.693E-06 | global batch size: 2048 | lm loss: 8.377337E+00 | loss scale: 4096.0 | grad norm: 51611.962 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 32/ 292968 | consumed samples: 65536 | consumed tokens: 4194304 | elapsed time per iteration (ms): 115764.0 | learning rate: 1.748E-06 | global batch size: 2048 | lm loss: 8.402411E+00 | loss scale: 4096.0 | grad norm: 61528.415 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 33/ 292968 | consumed samples: 67584 | consumed tokens: 4325376 | elapsed time per iteration (ms): 124382.1 | learning rate: 1.802E-06 | global batch size: 2048 | lm loss: 8.312696E+00 | loss scale: 4096.0 | grad norm: 24010.215 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 34/ 292968 | consumed samples: 69632 | consumed tokens: 4456448 | elapsed time per iteration (ms): 109629.6 | learning rate: 1.857E-06 | global batch size: 2048 | lm loss: 8.273209E+00 | loss scale: 4096.0 | grad norm: 30945.790 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 35/ 292968 | consumed samples: 71680 | consumed tokens: 4587520 | elapsed time per
iteration (ms): 124329.8 | learning rate: 1.911E-06 | global batch size: 2048 | lm loss: 8.289178E+00 | loss scale: 4096.0 | grad norm: 32987.729 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 36/ 292968 | consumed samples: 73728 | consumed tokens: 4718592 | elapsed time per iteration (ms): 125951.1 | learning rate: 1.966E-06 | global batch size: 2048 | lm loss: 8.222873E+00 | loss scale: 4096.0 | grad norm: 21715.211 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 37/ 292968 | consumed samples: 75776 | consumed tokens: 4849664 | elapsed time per iteration (ms): 120397.8 | learning rate: 2.021E-06 | global batch size: 2048 | lm loss: 8.240078E+00 | loss scale: 4096.0 | grad norm: 17729.094 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 38/ 292968 | consumed samples: 77824 | consumed tokens: 4980736 | elapsed time per iteration (ms): 115861.8 | learning rate: 2.075E-06 | global batch size: 2048 | lm loss: 8.185006E+00 | loss scale: 4096.0 | grad norm: 22333.806 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 39/ 292968 | consumed samples: 79872 | consumed tokens: 5111808 | elapsed time per iteration (ms): 109736.6 | learning rate: 2.130E-06 | global batch size: 2048 | lm loss: 8.259721E+00 | loss scale: 4096.0 | grad norm: 62233.185 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 40/ 292968 | consumed samples: 81920 | consumed tokens: 5242880 | elapsed time per iteration (ms): 106457.8 | learning rate: 2.185E-06 | global batch size: 2048 | lm loss: 8.176363E+00 | loss scale: 4096.0 | grad norm: 24827.400 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 41/ 292968 | consumed samples: 83968 | consumed tokens: 5373952 | elapsed time per iteration (ms): 109620.3 | learning rate: 2.239E-06 | global batch size: 2048 | lm loss: 8.170617E+00 | loss scale: 4096.0 | grad norm: 25861.100 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 42/ 292968 | consumed samples: 86016 | consumed tokens: 5505024 | elapsed time per iteration (ms): 106008.2 | learning rate: 2.294E-06 | global batch size: 2048 | lm loss: 8.115204E+00 | loss scale: 4096.0 | grad norm: 18760.832 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 43/ 292968 | consumed samples: 88064 | consumed tokens: 5636096 | elapsed time per iteration (ms): 104678.3 | learning rate: 2.348E-06 | global batch size: 2048 | lm loss: 8.103595E+00 | loss scale: 4096.0 | grad norm: 24468.237 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 44/ 292968 | consumed samples: 90112 | consumed tokens: 5767168 | elapsed time per iteration (ms): 106775.0 | learning rate: 2.403E-06 | global batch size: 2048 | lm loss: 8.097460E+00 | loss scale: 4096.0 | grad norm: 28875.772 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 45/ 292968 | consumed samples: 92160 | consumed 
tokens: 5898240 | elapsed time per iteration (ms): 108332.6 | learning rate: 2.458E-06 | global batch size: 2048 | lm loss: 8.078686E+00 | loss scale: 4096.0 | grad norm: 22659.751 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 46/ 292968 | consumed samples: 94208 | consumed tokens: 6029312 | elapsed time per iteration (ms): 109675.1 | learning rate: 2.512E-06 | global batch size: 2048 | lm loss: 8.059828E+00 | loss scale: 4096.0 | grad norm: 20091.720 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 47/ 292968 | consumed samples: 96256 | consumed tokens: 6160384 | elapsed time per iteration (ms): 111994.1 | learning rate: 2.567E-06 | global batch size: 2048 | lm loss: 7.996720E+00 | loss scale: 4096.0 | grad norm: 16327.955 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 48/ 292968 | consumed samples: 98304 | consumed tokens: 6291456 | elapsed time per iteration (ms): 108855.1 | learning rate: 2.621E-06 | global batch size: 2048 | lm loss: 8.016587E+00 | loss scale: 4096.0 | grad norm: 26369.002 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 49/ 292968 | consumed samples: 100352 | consumed tokens: 6422528 | elapsed time per iteration (ms): 103845.5 | learning rate: 2.676E-06 | global batch size: 2048 | lm loss: 7.984880E+00 | loss scale: 4096.0 | grad norm: 19863.681 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 50/ 292968 | consumed samples: 102400 | consumed tokens: 6553600 | elapsed time per iteration (ms): 104797.4 | learning rate: 2.731E-06 | global batch size: 2048 | lm loss: 7.966887E+00 | loss scale: 4096.0 | grad norm: 26876.409 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 51/ 292968 | consumed samples: 104448 | consumed tokens: 6684672 | elapsed time per iteration (ms): 104701.5 | learning rate: 2.785E-06 | global batch size: 2048 | lm loss: 7.961477E+00 | loss scale: 4096.0 | grad norm: 33274.161 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 52/ 292968 | consumed samples: 106496 | consumed tokens: 6815744 | elapsed time per iteration (ms): 117371.0 | learning rate: 2.840E-06 | global batch size: 2048 | lm loss: 7.924062E+00 | loss scale: 4096.0 | grad norm: 23619.820 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 53/ 292968 | consumed samples: 108544 | consumed tokens: 6946816 | elapsed time per iteration (ms): 100537.7 | learning rate: 2.895E-06 | global batch size: 2048 | lm loss: 7.961209E+00 | loss scale: 4096.0 | grad norm: 27558.631 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 54/ 292968 | consumed samples: 110592 | consumed tokens: 7077888 | elapsed time per iteration (ms): 107883.6 | learning rate: 2.949E-06 | global batch size: 2048 | lm loss: 7.918924E+00 | loss scale: 4096.0 | grad norm: 17735.411 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 55/ 
292968 | consumed samples: 112640 | consumed tokens: 7208960 | elapsed time per iteration (ms): 113286.8 | learning rate: 3.004E-06 | global batch size: 2048 | lm loss: 7.924952E+00 | loss scale: 4096.0 | grad norm: 35059.058 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 56/ 292968 | consumed samples: 114688 | consumed tokens: 7340032 | elapsed time per iteration (ms): 108019.4 | learning rate: 3.058E-06 | global batch size: 2048 | lm loss: 7.873817E+00 | loss scale: 4096.0 | grad norm: 23324.724 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 57/ 292968 | consumed samples: 116736 | consumed tokens: 7471104 | elapsed time per iteration (ms): 110237.6 | learning rate: 3.113E-06 | global batch size: 2048 | lm loss: 7.832249E+00 | loss scale: 4096.0 | grad norm: 22962.810 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 58/ 292968 | consumed samples: 118784 | consumed tokens: 7602176 | elapsed time per iteration (ms): 118075.5 | learning rate: 3.168E-06 | global batch size: 2048 | lm loss: 7.802713E+00 | loss scale: 4096.0 | grad norm: 26284.961 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 59/ 292968 | consumed samples: 120832 | consumed tokens: 7733248 | elapsed time per iteration (ms): 108952.9 | learning rate: 3.222E-06 | global batch size: 2048 | lm loss: 7.783186E+00 | loss scale: 4096.0 | grad norm: 19567.530 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 60/ 292968 | consumed samples: 122880 | consumed tokens: 7864320 | elapsed time per iteration (ms): 133287.9 | learning rate: 3.277E-06 | global batch size: 2048 | lm loss: 7.789031E+00 | loss scale: 4096.0 | grad norm: 24365.611 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 61/ 292968 | consumed samples: 124928 | consumed tokens: 7995392 | elapsed time per iteration (ms): 121268.9 | learning rate: 3.331E-06 | global batch size: 2048 | lm loss: 7.761158E+00 | loss scale: 4096.0 | grad norm: 21464.688 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 62/ 292968 | consumed samples: 126976 | consumed tokens: 8126464 | elapsed time per iteration (ms): 106597.2 | learning rate: 3.386E-06 | global batch size: 2048 | lm loss: 7.729983E+00 | loss scale: 4096.0 | grad norm: 27308.739 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 63/ 292968 | consumed samples: 129024 | consumed tokens: 8257536 | elapsed time per iteration (ms): 119244.1 | learning rate: 3.441E-06 | global batch size: 2048 | lm loss: 7.798817E+00 | loss scale: 4096.0 | grad norm: 63342.330 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 64/ 292968 | consumed samples: 131072 | consumed tokens: 8388608 | elapsed time per iteration (ms): 120042.3 | learning rate: 3.495E-06 | global batch size: 2048 | lm loss: 7.755435E+00 | loss scale: 4096.0 | grad norm: 52280.137 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of 
nan iterations: 0 | -time (ms) - iteration 65/ 292968 | consumed samples: 133120 | consumed tokens: 8519680 | elapsed time per iteration (ms): 120878.9 | learning rate: 3.550E-06 | global batch size: 2048 | lm loss: 7.715120E+00 | loss scale: 4096.0 | grad norm: 23561.567 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 66/ 292968 | consumed samples: 135168 | consumed tokens: 8650752 | elapsed time per iteration (ms): 107785.4 | learning rate: 3.604E-06 | global batch size: 2048 | lm loss: 7.706885E+00 | loss scale: 4096.0 | grad norm: 28158.448 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 67/ 292968 | consumed samples: 137216 | consumed tokens: 8781824 | elapsed time per iteration (ms): 120247.9 | learning rate: 3.659E-06 | global batch size: 2048 | lm loss: 7.651459E+00 | loss scale: 4096.0 | grad norm: 17741.711 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 68/ 292968 | consumed samples: 139264 | consumed tokens: 8912896 | elapsed time per iteration (ms): 118207.3 | learning rate: 3.714E-06 | global batch size: 2048 | lm loss: 7.638219E+00 | loss scale: 4096.0 | grad norm: 29792.122 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 69/ 292968 | consumed samples: 141312 | consumed tokens: 9043968 | elapsed time per iteration (ms): 112529.3 | learning rate: 3.768E-06 | global batch size: 2048 | lm loss: 7.667919E+00 | loss scale: 4096.0 | grad norm: 28840.534 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 70/ 292968 | consumed samples: 143360 | consumed tokens: 9175040 | elapsed time per iteration (ms): 115922.6 | learning rate: 3.823E-06 | global batch size: 2048 | lm loss: 7.676429E+00 | loss scale: 4096.0 | grad norm: 30859.853 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 71/ 292968 | consumed samples: 145408 | consumed tokens: 9306112 | elapsed time per iteration (ms): 109491.8 | learning rate: 3.878E-06 | global batch size: 2048 | lm loss: 7.579247E+00 | loss scale: 4096.0 | grad norm: 16607.983 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 72/ 292968 | consumed samples: 147456 | consumed tokens: 9437184 | elapsed time per iteration (ms): 100383.3 | learning rate: 3.932E-06 | global batch size: 2048 | lm loss: 7.640097E+00 | loss scale: 4096.0 | grad norm: 50007.876 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 73/ 292968 | consumed samples: 149504 | consumed tokens: 9568256 | elapsed time per iteration (ms): 107291.8 | learning rate: 3.987E-06 | global batch size: 2048 | lm loss: 7.628377E+00 | loss scale: 4096.0 | grad norm: 39217.411 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 74/ 292968 | consumed samples: 151552 | consumed tokens: 9699328 | elapsed time per iteration (ms): 103277.6 | learning rate: 4.041E-06 | global batch size: 2048 | lm loss: 7.558296E+00 | loss scale: 4096.0 | grad norm: 17426.653 | num zeros: 0.0 | curriculum seqlen: 64 
| number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 75/ 292968 | consumed samples: 153600 | consumed tokens: 9830400 | elapsed time per iteration (ms): 105025.5 | learning rate: 4.096E-06 | global batch size: 2048 | lm loss: 7.541232E+00 | loss scale: 4096.0 | grad norm: 21840.480 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 76/ 292968 | consumed samples: 155648 | consumed tokens: 9961472 | elapsed time per iteration (ms): 109478.4 | learning rate: 4.151E-06 | global batch size: 2048 | lm loss: 7.530804E+00 | loss scale: 4096.0 | grad norm: 25625.773 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 77/ 292968 | consumed samples: 157696 | consumed tokens: 10092544 | elapsed time per iteration (ms): 112497.9 | learning rate: 4.205E-06 | global batch size: 2048 | lm loss: 7.539927E+00 | loss scale: 4096.0 | grad norm: 28020.735 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 78/ 292968 | consumed samples: 159744 | consumed tokens: 10223616 | elapsed time per iteration (ms): 108695.6 | learning rate: 4.260E-06 | global batch size: 2048 | lm loss: 7.471020E+00 | loss scale: 4096.0 | grad norm: 21113.718 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 79/ 292968 | consumed samples: 161792 | consumed tokens: 10354688 | elapsed time per iteration (ms): 106184.6 | learning rate: 4.314E-06 | global batch size: 2048 | lm loss: 7.516878E+00 | loss scale: 4096.0 | grad norm: 40563.647 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 80/ 292968 | consumed samples: 163840 | consumed tokens: 10485760 | elapsed time per iteration (ms): 99318.3 | learning rate: 4.369E-06 | global batch size: 2048 | lm loss: 7.473183E+00 | loss scale: 4096.0 | grad norm: 19343.140 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 81/ 292968 | consumed samples: 165888 | consumed tokens: 10616832 | elapsed time per iteration (ms): 98438.8 | learning rate: 4.424E-06 | global batch size: 2048 | lm loss: 7.451110E+00 | loss scale: 4096.0 | grad norm: 18545.691 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 82/ 292968 | consumed samples: 167936 | consumed tokens: 10747904 | elapsed time per iteration (ms): 109868.9 | learning rate: 4.478E-06 | global batch size: 2048 | lm loss: 7.425596E+00 | loss scale: 4096.0 | grad norm: 20873.139 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 83/ 292968 | consumed samples: 169984 | consumed tokens: 10878976 | elapsed time per iteration (ms): 106920.1 | learning rate: 4.533E-06 | global batch size: 2048 | lm loss: 7.426252E+00 | loss scale: 4096.0 | grad norm: 16058.754 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 84/ 292968 | consumed samples: 172032 | consumed tokens: 11010048 | elapsed time per iteration (ms): 102797.2 | learning rate: 4.588E-06 | global batch size: 2048 | lm loss: 7.419496E+00 | loss scale: 4096.0 | grad norm: 
30855.532 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 85/ 292968 | consumed samples: 174080 | consumed tokens: 11141120 | elapsed time per iteration (ms): 99891.2 | learning rate: 4.642E-06 | global batch size: 2048 | lm loss: 7.400631E+00 | loss scale: 4096.0 | grad norm: 26228.902 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 86/ 292968 | consumed samples: 176128 | consumed tokens: 11272192 | elapsed time per iteration (ms): 99633.6 | learning rate: 4.697E-06 | global batch size: 2048 | lm loss: 7.362182E+00 | loss scale: 4096.0 | grad norm: 23025.011 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 87/ 292968 | consumed samples: 178176 | consumed tokens: 11403264 | elapsed time per iteration (ms): 99462.1 | learning rate: 4.751E-06 | global batch size: 2048 | lm loss: 7.363019E+00 | loss scale: 4096.0 | grad norm: 20108.364 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 88/ 292968 | consumed samples: 180224 | consumed tokens: 11534336 | elapsed time per iteration (ms): 97499.3 | learning rate: 4.806E-06 | global batch size: 2048 | lm loss: 7.334573E+00 | loss scale: 4096.0 | grad norm: 13027.283 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 89/ 292968 | consumed samples: 182272 | consumed tokens: 11665408 | elapsed time per iteration (ms): 99420.2 | learning rate: 4.861E-06 | global batch size: 2048 | lm loss: 7.349755E+00 | loss scale: 4096.0 | grad norm: 21345.372 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 90/ 292968 | consumed samples: 184320 | consumed tokens: 11796480 | elapsed time per iteration (ms): 99088.1 | learning rate: 4.915E-06 | global batch size: 2048 | lm loss: 7.320138E+00 | loss scale: 4096.0 | grad norm: 23927.098 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 91/ 292968 | consumed samples: 186368 | consumed tokens: 11927552 | elapsed time per iteration (ms): 98601.3 | learning rate: 4.970E-06 | global batch size: 2048 | lm loss: 7.286917E+00 | loss scale: 4096.0 | grad norm: 25027.122 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 92/ 292968 | consumed samples: 188416 | consumed tokens: 12058624 | elapsed time per iteration (ms): 99513.9 | learning rate: 5.024E-06 | global batch size: 2048 | lm loss: 7.326157E+00 | loss scale: 4096.0 | grad norm: 17566.280 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 93/ 292968 | consumed samples: 190464 | consumed tokens: 12189696 | elapsed time per iteration (ms): 98943.8 | learning rate: 5.079E-06 | global batch size: 2048 | lm loss: 7.271961E+00 | loss scale: 4096.0 | grad norm: 18026.157 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 94/ 292968 | consumed samples: 192512 | consumed tokens: 12320768 | elapsed time per iteration (ms): 99490.3 | learning rate: 5.134E-06 | global batch size: 2048 | lm loss: 
- iteration 95/ 292968 | consumed samples: 194560 | consumed tokens: 12451840 | elapsed time per iteration (ms): 99870.6 | learning rate: 5.188E-06 | global batch size: 2048 | lm loss: 7.301590E+00 | loss scale: 4096.0 | grad norm: 38731.595 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 96/ 292968 | consumed samples: 196608 | consumed tokens: 12582912 | elapsed time per iteration (ms): 98631.9 | learning rate: 5.243E-06 | global batch size: 2048 | lm loss: 7.340685E+00 | loss scale: 4096.0 | grad norm: 26227.612 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 97/ 292968 | consumed samples: 198656 | consumed tokens: 12713984 | elapsed time per iteration (ms): 99342.5 | learning rate: 5.297E-06 | global batch size: 2048 | lm loss: 7.269507E+00 | loss scale: 4096.0 | grad norm: 22830.147 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 98/ 292968 | consumed samples: 200704 | consumed tokens: 12845056 | elapsed time per iteration (ms): 99377.5 | learning rate: 5.352E-06 | global batch size: 2048 | lm loss: 7.387582E+00 | loss scale: 4096.0 | grad norm: 78447.308 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 99/ 292968 | consumed samples: 202752 | consumed tokens: 12976128 | elapsed time per iteration (ms): 97756.8 | learning rate: 5.407E-06 | global batch size: 2048 | lm loss: 7.313226E+00 | loss scale: 4096.0 | grad norm: 35784.828 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 100/ 292968 | consumed samples: 204800 | consumed tokens: 13107200 | elapsed time per iteration (ms): 98491.5 | learning rate: 5.461E-06 | global batch size: 2048 | lm loss: 7.303374E+00 | loss scale: 4096.0 | grad norm: 23264.085 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 101/ 292968 | consumed samples: 206848 | consumed tokens: 13238272 | elapsed time per iteration (ms): 99728.7 | learning rate: 5.516E-06 | global batch size: 2048 | lm loss: 7.290243E+00 | loss scale: 4096.0 | grad norm: 18378.851 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 102/ 292968 | consumed samples: 208896 | consumed tokens: 13369344 | elapsed time per iteration (ms): 100343.5 | learning rate: 5.571E-06 | global batch size: 2048 | lm loss: 7.295276E+00 | loss scale: 4096.0 | grad norm: 22842.996 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 103/ 292968 | consumed samples: 210944 | consumed tokens: 13500416 | elapsed time per iteration (ms): 98869.5 | learning rate: 5.625E-06 | global batch size: 2048 | lm loss: 7.195797E+00 | loss scale: 4096.0 | grad norm: 10681.646 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 104/ 292968 | consumed samples: 212992 | consumed tokens: 13631488 | elapsed time per iteration (ms): 97649.9 | learning rate: 5.680E-06 | global batch size: 2048 | lm loss: 7.314175E+00 | loss scale: 4096.0 | grad norm: 39999.305 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 105/ 292968 | consumed samples: 215040 | consumed tokens: 13762560 | elapsed time per iteration (ms): 99011.7 | learning rate: 5.734E-06 | global batch size: 2048 | lm loss: 7.255686E+00 | loss scale: 4096.0 | grad norm: 27317.798 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 106/ 292968 | consumed samples: 217088 | consumed tokens: 13893632 | elapsed time per iteration (ms): 99138.9 | learning rate: 5.789E-06 | global batch size: 2048 | lm loss: 7.240612E+00 | loss scale: 4096.0 | grad norm: 21889.390 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 107/ 292968 | consumed samples: 219136 | consumed tokens: 14024704 | elapsed time per iteration (ms): 98587.6 | learning rate: 5.844E-06 | global batch size: 2048 | lm loss: 7.217145E+00 | loss scale: 4096.0 | grad norm: 33046.466 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 108/ 292968 | consumed samples: 221184 | consumed tokens: 14155776 | elapsed time per iteration (ms): 99115.2 | learning rate: 5.898E-06 | global batch size: 2048 | lm loss: 7.189927E+00 | loss scale: 4096.0 | grad norm: 13847.408 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 109/ 292968 | consumed samples: 223232 | consumed tokens: 14286848 | elapsed time per iteration (ms): 98972.5 | learning rate: 5.953E-06 | global batch size: 2048 | lm loss: 7.210914E+00 | loss scale: 4096.0 | grad norm: 18010.193 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 110/ 292968 | consumed samples: 225280 | consumed tokens: 14417920 | elapsed time per iteration (ms): 99739.3 | learning rate: 6.007E-06 | global batch size: 2048 | lm loss: 7.188618E+00 | loss scale: 4096.0 | grad norm: 21448.433 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 111/ 292968 | consumed samples: 227328 | consumed tokens: 14548992 | elapsed time per iteration (ms): 98748.2 | learning rate: 6.062E-06 | global batch size: 2048 | lm loss: 7.203728E+00 | loss scale: 4096.0 | grad norm: 21531.101 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 112/ 292968 | consumed samples: 229376 | consumed tokens: 14680064 | elapsed time per iteration (ms): 98809.2 | learning rate: 6.117E-06 | global batch size: 2048 | lm loss: 7.174859E+00 | loss scale: 4096.0 | grad norm: 16447.579 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 113/ 292968 | consumed samples: 231424 | consumed tokens: 14811136 | elapsed time per iteration (ms): 99787.3 | learning rate: 6.171E-06 | global batch size: 2048 | lm loss: 7.165058E+00 | loss scale: 4096.0 | grad norm: 23175.387 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 114/ 292968 | consumed samples: 233472 | consumed tokens: 14942208 | elapsed time per iteration (ms): 98185.5 | learning rate: 6.226E-06 | global batch size: 2048 | lm loss: 7.112910E+00 | loss scale: 4096.0 | grad norm: 15551.220 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
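The accounting columns above are internally consistent: consumed tokens equals consumed samples times the current curriculum seqlen, and each iteration adds global batch size times seqlen tokens (2048 x 64 = 131072 while seqlen is 64). A quick check in Python (the numeric values are read from the records above; the helper itself is illustrative, not part of the training code):

    GLOBAL_BATCH_SIZE = 2048          # matches the "global batch size" column

    def consumed_tokens(consumed_samples, seqlen):
        # Under curriculum learning each sample currently contributes seqlen tokens.
        return consumed_samples * seqlen

    assert consumed_tokens(153600, 64) == 9830400    # iteration 75
    assert consumed_tokens(212992, 64) == 13631488   # iteration 104
    assert GLOBAL_BATCH_SIZE * 64 == 131072          # per-iteration token increment
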
- iteration 115/ 292968 | consumed samples: 235520 | consumed tokens: 15073280 | elapsed time per iteration (ms): 98562.7 | learning rate: 6.281E-06 | global batch size: 2048 | lm loss: 7.097376E+00 | loss scale: 4096.0 | grad norm: 13778.484 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 116/ 292968 | consumed samples: 237568 | consumed tokens: 15204352 | elapsed time per iteration (ms): 98680.4 | learning rate: 6.335E-06 | global batch size: 2048 | lm loss: 7.116792E+00 | loss scale: 4096.0 | grad norm: 15957.452 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 117/ 292968 | consumed samples: 239616 | consumed tokens: 15335424 | elapsed time per iteration (ms): 98025.6 | learning rate: 6.390E-06 | global batch size: 2048 | lm loss: 7.136622E+00 | loss scale: 4096.0 | grad norm: 17576.968 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 118/ 292968 | consumed samples: 241664 | consumed tokens: 15466496 | elapsed time per iteration (ms): 97903.4 | learning rate: 6.444E-06 | global batch size: 2048 | lm loss: 7.126158E+00 | loss scale: 4096.0 | grad norm: 18609.793 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 119/ 292968 | consumed samples: 243712 | consumed tokens: 15597568 | elapsed time per iteration (ms): 98276.8 | learning rate: 6.499E-06 | global batch size: 2048 | lm loss: 7.055730E+00 | loss scale: 4096.0 | grad norm: 14801.449 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 120/ 292968 | consumed samples: 245760 | consumed tokens: 15728640 | elapsed time per iteration (ms): 100337.4 | learning rate: 6.554E-06 | global batch size: 2048 | lm loss: 7.049195E+00 | loss scale: 4096.0 | grad norm: 12075.465 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 121/ 292968 | consumed samples: 247808 | consumed tokens: 15859712 | elapsed time per iteration (ms): 99564.5 | learning rate: 6.608E-06 | global batch size: 2048 | lm loss: 7.049836E+00 | loss scale: 4096.0 | grad norm: 23579.488 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 122/ 292968 | consumed samples: 249856 | consumed tokens: 15990784 | elapsed time per iteration (ms): 99012.8 | learning rate: 6.663E-06 | global batch size: 2048 | lm loss: 7.102861E+00 | loss scale: 4096.0 | grad norm: 17888.938 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 123/ 292968 | consumed samples: 251904 | consumed tokens: 16121856 | elapsed time per iteration (ms): 98770.4 | learning rate: 6.717E-06 | global batch size: 2048 | lm loss: 7.046860E+00 | loss scale: 4096.0 | grad norm: 12145.704 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 124/ 292968 | consumed samples: 253952 | consumed tokens: 16252928 | elapsed time per iteration (ms): 99153.8 | learning rate: 6.772E-06 | global batch size: 2048 | lm loss: 7.063597E+00 | loss scale: 4096.0 | grad norm: 26453.256 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 125/ 292968 | consumed samples: 256000 | consumed tokens: 16384000 | elapsed time per iteration (ms): 99915.1 | learning rate: 6.827E-06 | global batch size: 2048 | lm loss: 7.038830E+00 | loss scale: 4096.0 | grad norm: 17982.078 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 126/ 292968 | consumed samples: 258048 | consumed tokens: 16515072 | elapsed time per iteration (ms): 98120.4 | learning rate: 6.881E-06 | global batch size: 2048 | lm loss: 7.023058E+00 | loss scale: 4096.0 | grad norm: 11733.913 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 127/ 292968 | consumed samples: 260096 | consumed tokens: 16646144 | elapsed time per iteration (ms): 99193.3 | learning rate: 6.936E-06 | global batch size: 2048 | lm loss: 7.011484E+00 | loss scale: 4096.0 | grad norm: 18411.489 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 128/ 292968 | consumed samples: 262144 | consumed tokens: 16777216 | elapsed time per iteration (ms): 100353.5 | learning rate: 6.991E-06 | global batch size: 2048 | lm loss: 7.036419E+00 | loss scale: 4096.0 | grad norm: 12826.008 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 129/ 292968 | consumed samples: 264192 | consumed tokens: 16908288 | elapsed time per iteration (ms): 98689.6 | learning rate: 7.045E-06 | global batch size: 2048 | lm loss: 7.056478E+00 | loss scale: 4096.0 | grad norm: 50083.305 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 130/ 292968 | consumed samples: 266240 | consumed tokens: 17039360 | elapsed time per iteration (ms): 99876.4 | learning rate: 7.100E-06 | global batch size: 2048 | lm loss: 7.064220E+00 | loss scale: 4096.0 | grad norm: 18187.103 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 131/ 292968 | consumed samples: 268288 | consumed tokens: 17170432 | elapsed time per iteration (ms): 99172.6 | learning rate: 7.154E-06 | global batch size: 2048 | lm loss: 6.996428E+00 | loss scale: 4096.0 | grad norm: 18931.627 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 132/ 292968 | consumed samples: 270336 | consumed tokens: 17301504 | elapsed time per iteration (ms): 98583.5 | learning rate: 7.209E-06 | global batch size: 2048 | lm loss: 7.034263E+00 | loss scale: 4096.0 | grad norm: 25524.369 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 133/ 292968 | consumed samples: 272384 | consumed tokens: 17432576 | elapsed time per iteration (ms): 98388.2 | learning rate: 7.264E-06 | global batch size: 2048 | lm loss: 7.035317E+00 | loss scale: 4096.0 | grad norm: 27887.356 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 134/ 292968 | consumed samples: 274432 | consumed tokens: 17563648 | elapsed time per iteration (ms): 100148.0 | learning rate: 7.318E-06 | global batch size: 2048 | lm loss: 7.054586E+00 | loss scale: 4096.0 | grad norm: 17295.833 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 135/ 292968 | consumed samples: 276480 | consumed tokens: 17694720 | elapsed time per iteration (ms): 99122.8 | learning rate: 7.373E-06 | global batch size: 2048 | lm loss: 6.986097E+00 | loss scale: 4096.0 | grad norm: 13290.042 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 136/ 292968 | consumed samples: 278528 | consumed tokens: 17825792 | elapsed time per iteration (ms): 97064.0 | learning rate: 7.427E-06 | global batch size: 2048 | lm loss: 6.986552E+00 | loss scale: 4096.0 | grad norm: 19030.757 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 137/ 292968 | consumed samples: 280576 | consumed tokens: 17956864 | elapsed time per iteration (ms): 99764.4 | learning rate: 7.482E-06 | global batch size: 2048 | lm loss: 6.966130E+00 | loss scale: 4096.0 | grad norm: 21112.496 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 138/ 292968 | consumed samples: 282624 | consumed tokens: 18087936 | elapsed time per iteration (ms): 100485.4 | learning rate: 7.537E-06 | global batch size: 2048 | lm loss: 7.003498E+00 | loss scale: 4096.0 | grad norm: 22959.252 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 139/ 292968 | consumed samples: 284672 | consumed tokens: 18219008 | elapsed time per iteration (ms): 98444.8 | learning rate: 7.591E-06 | global batch size: 2048 | lm loss: 6.956960E+00 | loss scale: 4096.0 | grad norm: 14848.931 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 140/ 292968 | consumed samples: 286720 | consumed tokens: 18350080 | elapsed time per iteration (ms): 98605.0 | learning rate: 7.646E-06 | global batch size: 2048 | lm loss: 6.967386E+00 | loss scale: 4096.0 | grad norm: 28957.517 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 141/ 292968 | consumed samples: 288768 | consumed tokens: 18481152 | elapsed time per iteration (ms): 99201.4 | learning rate: 7.700E-06 | global batch size: 2048 | lm loss: 6.964898E+00 | loss scale: 4096.0 | grad norm: 15531.157 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 142/ 292968 | consumed samples: 290816 | consumed tokens: 18612224 | elapsed time per iteration (ms): 100186.1 | learning rate: 7.755E-06 | global batch size: 2048 | lm loss: 6.913935E+00 | loss scale: 4096.0 | grad norm: 16348.702 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 143/ 292968 | consumed samples: 292864 | consumed tokens: 18743296 | elapsed time per iteration (ms): 98185.2 | learning rate: 7.810E-06 | global batch size: 2048 | lm loss: 6.908429E+00 | loss scale: 4096.0 | grad norm: 13650.003 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 144/ 292968 | consumed samples: 294912 | consumed tokens: 18874368 | elapsed time per iteration (ms): 98287.8 | learning rate: 7.864E-06 | global batch size: 2048 | lm loss: 6.903642E+00 | loss scale: 4096.0 | grad norm: 13527.458 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 145/ 292968 | consumed samples: 296960 | consumed tokens: 19005440 | elapsed time per iteration (ms): 97323.5 | learning rate: 7.919E-06 | global batch size: 2048 | lm loss: 6.899990E+00 | loss scale: 4096.0 | grad norm: 19259.466 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 146/ 292968 | consumed samples: 299008 | consumed tokens: 19152896 | elapsed time per iteration (ms): 107806.0 | learning rate: 7.974E-06 | global batch size: 2048 | lm loss: 6.952594E+00 | loss scale: 4096.0 | grad norm: 15578.806 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 147/ 292968 | consumed samples: 301056 | consumed tokens: 19300352 | elapsed time per iteration (ms): 106755.9 | learning rate: 8.028E-06 | global batch size: 2048 | lm loss: 6.939005E+00 | loss scale: 4096.0 | grad norm: 22596.573 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 148/ 292968 | consumed samples: 303104 | consumed tokens: 19447808 | elapsed time per iteration (ms): 108334.5 | learning rate: 8.083E-06 | global batch size: 2048 | lm loss: 6.928869E+00 | loss scale: 4096.0 | grad norm: 13170.488 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 149/ 292968 | consumed samples: 305152 | consumed tokens: 19595264 | elapsed time per iteration (ms): 109740.4 | learning rate: 8.137E-06 | global batch size: 2048 | lm loss: 6.907570E+00 | loss scale: 4096.0 | grad norm: 20796.844 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 150/ 292968 | consumed samples: 307200 | consumed tokens: 19742720 | elapsed time per iteration (ms): 111136.1 | learning rate: 8.192E-06 | global batch size: 2048 | lm loss: 6.910664E+00 | loss scale: 4096.0 | grad norm: 24805.638 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
------------------------------------------------------------------------------------------------
- validation loss at iteration 150 | lm loss value: 6.911995E+00 | lm loss PPL: 1.004249E+03 |
------------------------------------------------------------------------------------------------
- iteration 151/ 292968 | consumed samples: 309248 | consumed tokens: 19890176 | elapsed time per iteration (ms): 314384.7 | learning rate: 8.247E-06 | global batch size: 2048 | lm loss: 6.927706E+00 | loss scale: 4096.0 | grad norm: 16620.224 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
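Two things happen in the records above. First, at iteration 146 the curriculum seqlen steps from 64 to 72, so the per-iteration token increment grows from 2048 x 64 = 131072 to 2048 x 72 = 147456, and elapsed time per iteration rises from roughly 98-100 s to 107-111 s. Second, the first validation pass runs at iteration 150, and the reported perplexity is simply the exponential of the lm loss; iteration 151's elapsed time (314384.7 ms) is inflated because the per-iteration timer absorbs the evaluation pause. Both relations can be checked directly (values from the log):

    import math

    lm_loss = 6.911995                       # validation lm loss at iteration 150
    print(f"{math.exp(lm_loss):.6e}")        # 1.004249e+03, matching the logged PPL

    # Curriculum step 64 -> 72 at iteration 146, visible in the token column:
    assert 19152896 - 19005440 == 2048 * 72  # increment is now 147456 tokens
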
- iteration 152/ 292968 | consumed samples: 311296 | consumed tokens: 20037632 | elapsed time per iteration (ms): 108974.5 | learning rate: 8.301E-06 | global batch size: 2048 | lm loss: 6.903427E+00 | loss scale: 4096.0 | grad norm: 13701.564 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 153/ 292968 | consumed samples: 313344 | consumed tokens: 20185088 | elapsed time per iteration (ms): 110876.4 | learning rate: 8.356E-06 | global batch size: 2048 | lm loss: 6.847519E+00 | loss scale: 4096.0 | grad norm: 9037.545 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 154/ 292968 | consumed samples: 315392 | consumed tokens: 20332544 | elapsed time per iteration (ms): 108091.5 | learning rate: 8.410E-06 | global batch size: 2048 | lm loss: 6.870549E+00 | loss scale: 4096.0 | grad norm: 21755.013 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 155/ 292968 | consumed samples: 317440 | consumed tokens: 20480000 | elapsed time per iteration (ms): 109902.4 | learning rate: 8.465E-06 | global batch size: 2048 | lm loss: 6.831274E+00 | loss scale: 4096.0 | grad norm: 13835.802 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 156/ 292968 | consumed samples: 319488 | consumed tokens: 20627456 | elapsed time per iteration (ms): 109792.2 | learning rate: 8.520E-06 | global batch size: 2048 | lm loss: 6.868259E+00 | loss scale: 4096.0 | grad norm: 27263.731 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 157/ 292968 | consumed samples: 321536 | consumed tokens: 20774912 | elapsed time per iteration (ms): 109874.5 | learning rate: 8.574E-06 | global batch size: 2048 | lm loss: 6.899713E+00 | loss scale: 4096.0 | grad norm: 13148.958 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 158/ 292968 | consumed samples: 323584 | consumed tokens: 20922368 | elapsed time per iteration (ms): 108993.5 | learning rate: 8.629E-06 | global batch size: 2048 | lm loss: 6.920228E+00 | loss scale: 4096.0 | grad norm: 23212.972 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 159/ 292968 | consumed samples: 325632 | consumed tokens: 21069824 | elapsed time per iteration (ms): 109216.8 | learning rate: 8.684E-06 | global batch size: 2048 | lm loss: 6.888138E+00 | loss scale: 4096.0 | grad norm: 19877.359 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 160/ 292968 | consumed samples: 327680 | consumed tokens: 21217280 | elapsed time per iteration (ms): 106681.3 | learning rate: 8.738E-06 | global batch size: 2048 | lm loss: 6.874300E+00 | loss scale: 4096.0 | grad norm: 16758.440 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 161/ 292968 | consumed samples: 329728 | consumed tokens: 21364736 | elapsed time per iteration (ms): 108747.5 | learning rate: 8.793E-06 | global batch size: 2048 | lm loss: 6.848676E+00 | loss scale: 4096.0 | grad norm: 15132.008 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 162/ 292968 | consumed samples: 331776 | consumed tokens: 21512192 | elapsed time per iteration (ms): 109505.8 | learning rate: 8.847E-06 | global batch size: 2048 | lm loss: 6.838581E+00 | loss scale: 4096.0 | grad norm: 15975.375 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 163/ 292968 | consumed samples: 333824 | consumed tokens: 21659648 | elapsed time per iteration (ms): 109443.7 | learning rate: 8.902E-06 | global batch size: 2048 | lm loss: 6.816732E+00 | loss scale: 4096.0 | grad norm: 12297.865 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 164/ 292968 | consumed samples: 335872 | consumed tokens: 21807104 | elapsed time per iteration (ms): 109315.4 | learning rate: 8.957E-06 | global batch size: 2048 | lm loss: 6.810020E+00 | loss scale: 4096.0 | grad norm: 13808.706 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 165/ 292968 | consumed samples: 337920 | consumed tokens: 21954560 | elapsed time per iteration (ms): 110133.0 | learning rate: 9.011E-06 | global batch size: 2048 | lm loss: 6.785074E+00 | loss scale: 4096.0 | grad norm: 12462.032 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 166/ 292968 | consumed samples: 339968 | consumed tokens: 22102016 | elapsed time per iteration (ms): 109032.2 | learning rate: 9.066E-06 | global batch size: 2048 | lm loss: 6.819090E+00 | loss scale: 4096.0 | grad norm: 17466.047 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 167/ 292968 | consumed samples: 342016 | consumed tokens: 22249472 | elapsed time per iteration (ms): 108953.2 | learning rate: 9.120E-06 | global batch size: 2048 | lm loss: 6.784965E+00 | loss scale: 4096.0 | grad norm: 14037.632 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 168/ 292968 | consumed samples: 344064 | consumed tokens: 22396928 | elapsed time per iteration (ms): 109361.5 | learning rate: 9.175E-06 | global batch size: 2048 | lm loss: 6.823694E+00 | loss scale: 4096.0 | grad norm: 37452.133 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 169/ 292968 | consumed samples: 346112 | consumed tokens: 22544384 | elapsed time per iteration (ms): 108853.0 | learning rate: 9.230E-06 | global batch size: 2048 | lm loss: 6.820905E+00 | loss scale: 4096.0 | grad norm: 13290.574 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 170/ 292968 | consumed samples: 348160 | consumed tokens: 22691840 | elapsed time per iteration (ms): 109163.3 | learning rate: 9.284E-06 | global batch size: 2048 | lm loss: 6.785219E+00 | loss scale: 4096.0 | grad norm: 14191.572 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 171/ 292968 | consumed samples: 350208 | consumed tokens: 22839296 | elapsed time per iteration (ms): 109531.1 | learning rate: 9.339E-06 | global batch size: 2048 | lm loss: 6.760223E+00 | loss scale: 4096.0 | grad norm: 16079.621 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 172/ 292968 | consumed samples: 352256 | consumed tokens: 22986752 | elapsed time per iteration (ms): 108022.7 | learning rate: 9.393E-06 | global batch size: 2048 | lm loss: 6.744514E+00 | loss scale: 4096.0 | grad norm: 24216.358 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 173/ 292968 | consumed samples: 354304 | consumed tokens: 23134208 | elapsed time per iteration (ms): 107994.4 | learning rate: 9.448E-06 | global batch size: 2048 | lm loss: 6.764698E+00 | loss scale: 4096.0 | grad norm: 13868.582 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 174/ 292968 | consumed samples: 356352 | consumed tokens: 23281664 | elapsed time per iteration (ms): 109272.2 | learning rate: 9.503E-06 | global batch size: 2048 | lm loss: 6.738492E+00 | loss scale: 4096.0 | grad norm: 17560.117 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 175/ 292968 | consumed samples: 358400 | consumed tokens: 23429120 | elapsed time per iteration (ms): 109540.2 | learning rate: 9.557E-06 | global batch size: 2048 | lm loss: 6.742512E+00 | loss scale: 4096.0 | grad norm: 13064.055 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 176/ 292968 | consumed samples: 360448 | consumed tokens: 23576576 | elapsed time per iteration (ms): 110011.0 | learning rate: 9.612E-06 | global batch size: 2048 | lm loss: 6.769942E+00 | loss scale: 4096.0 | grad norm: 13317.601 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 177/ 292968 | consumed samples: 362496 | consumed tokens: 23724032 | elapsed time per iteration (ms): 109418.0 | learning rate: 9.667E-06 | global batch size: 2048 | lm loss: 6.726838E+00 | loss scale: 4096.0 | grad norm: 19210.414 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 178/ 292968 | consumed samples: 364544 | consumed tokens: 23871488 | elapsed time per iteration (ms): 108638.8 | learning rate: 9.721E-06 | global batch size: 2048 | lm loss: 6.725516E+00 | loss scale: 4096.0 | grad norm: 11652.375 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 179/ 292968 | consumed samples: 366592 | consumed tokens: 24018944 | elapsed time per iteration (ms): 109675.2 | learning rate: 9.776E-06 | global batch size: 2048 | lm loss: 6.718335E+00 | loss scale: 4096.0 | grad norm: 10500.907 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 180/ 292968 | consumed samples: 368640 | consumed tokens: 24166400 | elapsed time per iteration (ms): 108484.3 | learning rate: 9.830E-06 | global batch size: 2048 | lm loss: 6.698410E+00 | loss scale: 4096.0 | grad norm: 13786.060 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 181/ 292968 | consumed samples: 370688 | consumed tokens: 24313856 | elapsed time per iteration (ms): 109435.3 | learning rate: 9.885E-06 | global batch size: 2048 | lm loss: 6.687134E+00 | loss scale: 4096.0 | grad norm: 12244.639 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 182/ 292968 | consumed samples: 372736 | consumed tokens: 24461312 | elapsed time per iteration (ms): 108150.9 | learning rate: 9.940E-06 | global batch size: 2048 | lm loss: 6.692582E+00 | loss scale: 4096.0 | grad norm: 12113.509 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 183/ 292968 | consumed samples: 374784 | consumed tokens: 24608768 | elapsed time per iteration (ms): 108319.9 | learning rate: 9.994E-06 | global batch size: 2048 | lm loss: 6.730206E+00 | loss scale: 4096.0 | grad norm: 18876.822 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 184/ 292968 | consumed samples: 376832 | consumed tokens: 24756224 | elapsed time per iteration (ms): 110981.5 | learning rate: 1.005E-05 | global batch size: 2048 | lm loss: 6.712937E+00 | loss scale: 4096.0 | grad norm: 10725.498 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 185/ 292968 | consumed samples: 378880 | consumed tokens: 24903680 | elapsed time per iteration (ms): 108264.6 | learning rate: 1.010E-05 | global batch size: 2048 | lm loss: 6.659677E+00 | loss scale: 4096.0 | grad norm: 9318.050 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 186/ 292968 | consumed samples: 380928 | consumed tokens: 25051136 | elapsed time per iteration (ms): 110629.4 | learning rate: 1.016E-05 | global batch size: 2048 | lm loss: 6.691420E+00 | loss scale: 4096.0 | grad norm: 17660.429 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 187/ 292968 | consumed samples: 382976 | consumed tokens: 25198592 | elapsed time per iteration (ms): 108581.6 | learning rate: 1.021E-05 | global batch size: 2048 | lm loss: 6.690703E+00 | loss scale: 4096.0 | grad norm: 13805.891 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 188/ 292968 | consumed samples: 385024 | consumed tokens: 25346048 | elapsed time per iteration (ms): 109141.2 | learning rate: 1.027E-05 | global batch size: 2048 | lm loss: 6.678379E+00 | loss scale: 4096.0 | grad norm: 10400.606 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 189/ 292968 | consumed samples: 387072 | consumed tokens: 25493504 | elapsed time per iteration (ms): 109256.9 | learning rate: 1.032E-05 | global batch size: 2048 | lm loss: 6.724946E+00 | loss scale: 4096.0 | grad norm: 26447.588 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 190/ 292968 | consumed samples: 389120 | consumed tokens: 25640960 | elapsed time per iteration (ms): 108409.5 | learning rate: 1.038E-05 | global batch size: 2048 | lm loss: 6.720017E+00 | loss scale: 4096.0 | grad norm: 18958.479 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 191/ 292968 | consumed samples: 391168 | consumed tokens: 25788416 | elapsed time per iteration (ms): 111186.5 | learning rate: 1.043E-05 | global batch size: 2048 | lm loss: 6.727012E+00 | loss scale: 4096.0 | grad norm: 14026.941 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
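The learning-rate column is still in warmup here and grows linearly with consumed samples, at about 2.6667e-11 per sample. That slope is consistent with, for example, a peak LR of 6e-5 reached after 2.25M warmup samples, though neither endpoint is visible in this window; both endpoints in the sketch below are assumptions, and only the slope is read off the log:

    # Hypothetical warmup endpoints (peak 6e-5 over 2_250_000 samples); only
    # the slope is directly observable in the records above.
    def warmup_lr(consumed_samples, peak_lr=6e-5, warmup_samples=2_250_000):
        return peak_lr * min(1.0, consumed_samples / warmup_samples)

    assert abs(warmup_lr(307200) - 8.192e-06) < 5e-9   # iteration 150
    assert abs(warmup_lr(374784) - 9.994e-06) < 5e-9   # iteration 183 (logged to 4 digits)
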
- iteration 192/ 292968 | consumed samples: 393216 | consumed tokens: 25935872 | elapsed time per iteration (ms): 108789.6 | learning rate: 1.049E-05 | global batch size: 2048 | lm loss: 6.711470E+00 | loss scale: 4096.0 | grad norm: 12658.672 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 193/ 292968 | consumed samples: 395264 | consumed tokens: 26083328 | elapsed time per iteration (ms): 109623.3 | learning rate: 1.054E-05 | global batch size: 2048 | lm loss: 6.681795E+00 | loss scale: 4096.0 | grad norm: 16106.022 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 194/ 292968 | consumed samples: 397312 | consumed tokens: 26230784 | elapsed time per iteration (ms): 108407.6 | learning rate: 1.059E-05 | global batch size: 2048 | lm loss: 6.693110E+00 | loss scale: 4096.0 | grad norm: 13351.057 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 195/ 292968 | consumed samples: 399360 | consumed tokens: 26378240 | elapsed time per iteration (ms): 108247.6 | learning rate: 1.065E-05 | global batch size: 2048 | lm loss: 6.647738E+00 | loss scale: 4096.0 | grad norm: 11189.695 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 196/ 292968 | consumed samples: 401408 | consumed tokens: 26525696 | elapsed time per iteration (ms): 110072.0 | learning rate: 1.070E-05 | global batch size: 2048 | lm loss: 6.649861E+00 | loss scale: 4096.0 | grad norm: 18856.375 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 197/ 292968 | consumed samples: 403456 | consumed tokens: 26673152 | elapsed time per iteration (ms): 108782.0 | learning rate: 1.076E-05 | global batch size: 2048 | lm loss: 6.688879E+00 | loss scale: 4096.0 | grad norm: 12075.172 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 198/ 292968 | consumed samples: 405504 | consumed tokens: 26820608 | elapsed time per iteration (ms): 109766.0 | learning rate: 1.081E-05 | global batch size: 2048 | lm loss: 6.667139E+00 | loss scale: 4096.0 | grad norm: 12386.103 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 199/ 292968 | consumed samples: 407552 | consumed tokens: 26968064 | elapsed time per iteration (ms): 109105.3 | learning rate: 1.087E-05 | global batch size: 2048 | lm loss: 6.642043E+00 | loss scale: 4096.0 | grad norm: 9685.123 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 200/ 292968 | consumed samples: 409600 | consumed tokens: 27115520 | elapsed time per iteration (ms): 109241.2 | learning rate: 1.092E-05 | global batch size: 2048 | lm loss: 6.609816E+00 | loss scale: 4096.0 | grad norm: 13032.741 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 201/ 292968 | consumed samples: 411648 | consumed tokens: 27262976 | elapsed time per iteration (ms): 108950.1 | learning rate: 1.098E-05 | global batch size: 2048 | lm loss: 6.591060E+00 | loss scale: 4096.0 | grad norm: 16272.868 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 202/ 292968 | consumed samples: 413696 | consumed tokens: 27410432 | elapsed time per iteration (ms): 108869.4 | learning rate: 1.103E-05 | global batch size: 2048 | lm loss: 6.630847E+00 | loss scale: 4096.0 | grad norm: 13740.367 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 203/ 292968 | consumed samples: 415744 | consumed tokens: 27557888 | elapsed time per iteration (ms): 110939.9 | learning rate: 1.109E-05 | global batch size: 2048 | lm loss: 6.622442E+00 | loss scale: 4096.0 | grad norm: 23543.715 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 204/ 292968 | consumed samples: 417792 | consumed tokens: 27705344 | elapsed time per iteration (ms): 108469.9 | learning rate: 1.114E-05 | global batch size: 2048 | lm loss: 6.601192E+00 | loss scale: 4096.0 | grad norm: 14236.997 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 205/ 292968 | consumed samples: 419840 | consumed tokens: 27852800 | elapsed time per iteration (ms): 108762.0 | learning rate: 1.120E-05 | global batch size: 2048 | lm loss: 6.620346E+00 | loss scale: 4096.0 | grad norm: 9273.302 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 206/ 292968 | consumed samples: 421888 | consumed tokens: 28000256 | elapsed time per iteration (ms): 108774.9 | learning rate: 1.125E-05 | global batch size: 2048 | lm loss: 6.663992E+00 | loss scale: 4096.0 | grad norm: 34656.545 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 207/ 292968 | consumed samples: 423936 | consumed tokens: 28147712 | elapsed time per iteration (ms): 109473.7 | learning rate: 1.130E-05 | global batch size: 2048 | lm loss: 6.631517E+00 | loss scale: 4096.0 | grad norm: 17243.316 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 208/ 292968 | consumed samples: 425984 | consumed tokens: 28295168 | elapsed time per iteration (ms): 109185.4 | learning rate: 1.136E-05 | global batch size: 2048 | lm loss: 6.636267E+00 | loss scale: 4096.0 | grad norm: 13673.636 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 209/ 292968 | consumed samples: 428032 | consumed tokens: 28442624 | elapsed time per iteration (ms): 108221.0 | learning rate: 1.141E-05 | global batch size: 2048 | lm loss: 6.616088E+00 | loss scale: 4096.0 | grad norm: 13025.014 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 210/ 292968 | consumed samples: 430080 | consumed tokens: 28590080 | elapsed time per iteration (ms): 109925.7 | learning rate: 1.147E-05 | global batch size: 2048 | lm loss: 6.664887E+00 | loss scale: 4096.0 | grad norm: 17361.344 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 211/ 292968 | consumed samples: 432128 | consumed tokens: 28737536 | elapsed time per iteration (ms): 108592.2 | learning rate: 1.152E-05 | global batch size: 2048 | lm loss: 6.596425E+00 | loss scale: 4096.0 | grad norm: 11002.662 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 212/ 292968 | consumed samples: 434176 | consumed tokens: 28884992 | elapsed time per iteration (ms): 108393.6 | learning rate: 1.158E-05 | global batch size: 2048 | lm loss: 6.619356E+00 | loss scale: 4096.0 | grad norm: 15912.693 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 213/ 292968 | consumed samples: 436224 | consumed tokens: 29032448 | elapsed time per iteration (ms): 109338.8 | learning rate: 1.163E-05 | global batch size: 2048 | lm loss: 6.599881E+00 | loss scale: 4096.0 | grad norm: 11826.809 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 214/ 292968 | consumed samples: 438272 | consumed tokens: 29179904 | elapsed time per iteration (ms): 108881.3 | learning rate: 1.169E-05 | global batch size: 2048 | lm loss: 6.568992E+00 | loss scale: 4096.0 | grad norm: 8395.689 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 215/ 292968 | consumed samples: 440320 | consumed tokens: 29327360 | elapsed time per iteration (ms): 109721.8 | learning rate: 1.174E-05 | global batch size: 2048 | lm loss: 6.542880E+00 | loss scale: 4096.0 | grad norm: 9000.265 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 216/ 292968 | consumed samples: 442368 | consumed tokens: 29474816 | elapsed time per iteration (ms): 108053.4 | learning rate: 1.180E-05 | global batch size: 2048 | lm loss: 6.572178E+00 | loss scale: 4096.0 | grad norm: 11927.749 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 217/ 292968 | consumed samples: 444416 | consumed tokens: 29622272 | elapsed time per iteration (ms): 109847.9 | learning rate: 1.185E-05 | global batch size: 2048 | lm loss: 6.566045E+00 | loss scale: 4096.0 | grad norm: 10303.251 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 218/ 292968 | consumed samples: 446464 | consumed tokens: 29769728 | elapsed time per iteration (ms): 109033.4 | learning rate: 1.191E-05 | global batch size: 2048 | lm loss: 6.564643E+00 | loss scale: 4096.0 | grad norm: 13959.244 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 219/ 292968 | consumed samples: 448512 | consumed tokens: 29917184 | elapsed time per iteration (ms): 108967.9 | learning rate: 1.196E-05 | global batch size: 2048 | lm loss: 6.564982E+00 | loss scale: 4096.0 | grad norm: 10680.202 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 220/ 292968 | consumed samples: 450560 | consumed tokens: 30064640 | elapsed time per iteration (ms): 108753.5 | learning rate: 1.201E-05 | global batch size: 2048 | lm loss: 6.549003E+00 | loss scale: 4096.0 | grad norm: 11329.565 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 221/ 292968 | consumed samples: 452608 | consumed tokens: 30212096 | elapsed time per iteration (ms): 108436.4 | learning rate: 1.207E-05 | global batch size: 2048 | lm loss: 6.569693E+00 | loss scale: 4096.0 | grad norm: 10997.802 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 222/ 292968 | consumed samples: 454656 | consumed tokens: 30359552 | elapsed time per iteration (ms): 107874.9 | learning rate: 1.212E-05 | global batch size: 2048 | lm loss: 6.517329E+00 | loss scale: 4096.0 | grad norm: 7876.751 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 223/ 292968 | consumed samples: 456704 | consumed tokens: 30507008 | elapsed time per iteration (ms): 108015.7 | learning rate: 1.218E-05 | global batch size: 2048 | lm loss: 6.522130E+00 | loss scale: 4096.0 | grad norm: 16113.010 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 224/ 292968 | consumed samples: 458752 | consumed tokens: 30654464 | elapsed time per iteration (ms): 109946.7 | learning rate: 1.223E-05 | global batch size: 2048 | lm loss: 6.532452E+00 | loss scale: 4096.0 | grad norm: 11770.817 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 225/ 292968 | consumed samples: 460800 | consumed tokens: 30801920 | elapsed time per iteration (ms): 109915.9 | learning rate: 1.229E-05 | global batch size: 2048 | lm loss: 6.518247E+00 | loss scale: 4096.0 | grad norm: 10109.630 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 226/ 292968 | consumed samples: 462848 | consumed tokens: 30949376 | elapsed time per iteration (ms): 109913.5 | learning rate: 1.234E-05 | global batch size: 2048 | lm loss: 6.528529E+00 | loss scale: 4096.0 | grad norm: 13449.174 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 227/ 292968 | consumed samples: 464896 | consumed tokens: 31096832 | elapsed time per iteration (ms): 108450.5 | learning rate: 1.240E-05 | global batch size: 2048 | lm loss: 6.521327E+00 | loss scale: 4096.0 | grad norm: 13044.262 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 228/ 292968 | consumed samples: 466944 | consumed tokens: 31244288 | elapsed time per iteration (ms): 108330.5 | learning rate: 1.245E-05 | global batch size: 2048 | lm loss: 6.482043E+00 | loss scale: 4096.0 | grad norm: 6327.952 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 229/ 292968 | consumed samples: 468992 | consumed tokens: 31391744 | elapsed time per iteration (ms): 107514.7 | learning rate: 1.251E-05 | global batch size: 2048 | lm loss: 6.525314E+00 | loss scale: 4096.0 | grad norm: 24079.390 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 230/ 292968 | consumed samples: 471040 | consumed tokens: 31539200 | elapsed time per iteration (ms): 111113.0 | learning rate: 1.256E-05 | global batch size: 2048 | lm loss: 6.623558E+00 | loss scale: 4096.0 | grad norm: 13173.067 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 231/ 292968 | consumed samples: 473088 | consumed tokens: 31686656 | elapsed time per iteration (ms): 108591.5 | learning rate: 1.262E-05 | global batch size: 2048 | lm loss: 6.527527E+00 | loss scale: 4096.0 | grad norm: 10151.047 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 232/ 292968 | consumed samples: 475136 | consumed tokens: 31834112 | elapsed time per iteration (ms): 110404.9 | learning rate: 1.267E-05 | global batch size: 2048 | lm loss: 6.556199E+00 | loss scale: 4096.0 | grad norm: 17483.376 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 233/ 292968 | consumed samples: 477184 | consumed tokens: 31981568 | elapsed time per iteration (ms): 109869.8 | learning rate: 1.272E-05 | global batch size: 2048 | lm loss: 6.514931E+00 | loss scale: 4096.0 | grad norm: 8096.373 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 234/ 292968 | consumed samples: 479232 | consumed tokens: 32129024 | elapsed time per iteration (ms): 110017.4 | learning rate: 1.278E-05 | global batch size: 2048 | lm loss: 6.518210E+00 | loss scale: 4096.0 | grad norm: 11606.961 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 235/ 292968 | consumed samples: 481280 | consumed tokens: 32276480 | elapsed time per iteration (ms): 109440.7 | learning rate: 1.283E-05 | global batch size: 2048 | lm loss: 6.498292E+00 | loss scale: 4096.0 | grad norm: 9005.038 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 236/ 292968 | consumed samples: 483328 | consumed tokens: 32423936 | elapsed time per iteration (ms): 109990.9 | learning rate: 1.289E-05 | global batch size: 2048 | lm loss: 6.525797E+00 | loss scale: 4096.0 | grad norm: 12458.174 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 237/ 292968 | consumed samples: 485376 | consumed tokens: 32571392 | elapsed time per iteration (ms): 108841.8 | learning rate: 1.294E-05 | global batch size: 2048 | lm loss: 6.490116E+00 | loss scale: 4096.0 | grad norm: 10265.911 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 238/ 292968 | consumed samples: 487424 | consumed tokens: 32718848 | elapsed time per iteration (ms): 109491.3 | learning rate: 1.300E-05 | global batch size: 2048 | lm loss: 6.474614E+00 | loss scale: 4096.0 | grad norm: 10958.496 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 239/ 292968 | consumed samples: 489472 | consumed tokens: 32866304 | elapsed time per iteration (ms): 107974.5 | learning rate: 1.305E-05 | global batch size: 2048 | lm loss: 6.506901E+00 | loss scale: 4096.0 | grad norm: 11294.935 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 240/ 292968 | consumed samples: 491520 | consumed tokens: 33013760 | elapsed time per iteration (ms): 107137.9 | learning rate: 1.311E-05 | global batch size: 2048 | lm loss: 6.472748E+00 | loss scale: 4096.0 | grad norm: 9739.804 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
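From these columns one can also read off a rough global throughput: at seqlen 72 each iteration processes 2048 x 72 = 147456 tokens in roughly 108-110 s, about 1.35k tokens/s, versus 2048 x 64 = 131072 tokens in roughly 99 s (about 1.32k tokens/s) during the earlier seqlen-64 phase. Plain arithmetic on the logged values (the helper below is illustrative, not a profiler):

    def tokens_per_second(batch_size, seqlen, iter_ms):
        # Global throughput implied by a single logged iteration.
        return batch_size * seqlen / (iter_ms / 1000.0)

    print(round(tokens_per_second(2048, 64, 99000.0)))    # ~1324 tokens/s, seqlen-64 phase
    print(round(tokens_per_second(2048, 72, 109000.0)))   # ~1353 tokens/s, seqlen-72 phase
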
- iteration 241/ 292968 | consumed samples: 493568 | consumed tokens: 33161216 | elapsed time per iteration (ms): 109980.8 | learning rate: 1.316E-05 | global batch size: 2048 | lm loss: 6.455049E+00 | loss scale: 4096.0 | grad norm: 12494.447 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 242/ 292968 | consumed samples: 495616 | consumed tokens: 33308672 | elapsed time per iteration (ms): 107918.1 | learning rate: 1.322E-05 | global batch size: 2048 | lm loss: 6.493991E+00 | loss scale: 4096.0 | grad norm: 12065.325 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 243/ 292968 | consumed samples: 497664 | consumed tokens: 33456128 | elapsed time per iteration (ms): 107653.9 | learning rate: 1.327E-05 | global batch size: 2048 | lm loss: 6.458516E+00 | loss scale: 4096.0 | grad norm: 6746.326 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 244/ 292968 | consumed samples: 499712 | consumed tokens: 33603584 | elapsed time per iteration (ms): 108841.2 | learning rate: 1.333E-05 | global batch size: 2048 | lm loss: 6.454665E+00 | loss scale: 4096.0 | grad norm: 20224.532 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 245/ 292968 | consumed samples: 501760 | consumed tokens: 33751040 | elapsed time per iteration (ms): 109534.9 | learning rate: 1.338E-05 | global batch size: 2048 | lm loss: 6.475075E+00 | loss scale: 4096.0 | grad norm: 11690.787 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 246/ 292968 | consumed samples: 503808 | consumed tokens: 33898496 | elapsed time per iteration (ms): 109262.0 | learning rate: 1.343E-05 | global batch size: 2048 | lm loss: 6.457047E+00 | loss scale: 4096.0 | grad norm: 11788.945 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 247/ 292968 | consumed samples: 505856 | consumed tokens: 34045952 | elapsed time per iteration (ms): 109793.7 | learning rate: 1.349E-05 | global batch size: 2048 | lm loss: 6.448865E+00 | loss scale: 4096.0 | grad norm: 8746.236 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 248/ 292968 | consumed samples: 507904 | consumed tokens: 34193408 | elapsed time per iteration (ms): 111125.9 | learning rate: 1.354E-05 | global batch size: 2048 | lm loss: 6.451093E+00 | loss scale: 4096.0 | grad norm: 7669.336 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 249/ 292968 | consumed samples: 509952 | consumed tokens: 34340864 | elapsed time per iteration (ms): 108866.1 | learning rate: 1.360E-05 | global batch size: 2048 | lm loss: 6.460510E+00 | loss scale: 4096.0 | grad norm: 11032.057 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 250/ 292968 | consumed samples: 512000 | consumed tokens: 34488320 | elapsed time per iteration (ms): 107737.4 | learning rate: 1.365E-05 | global batch size: 2048 | lm loss: 6.444838E+00 | loss scale: 4096.0 | grad norm: 10519.254 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 251/ 292968 | consumed samples: 514048 | consumed tokens: 34635776 | elapsed time per iteration (ms): 109650.5 | learning rate: 1.371E-05 | global batch size: 2048 | lm loss: 6.447746E+00 | loss scale: 4096.0 | grad norm: 13883.440 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 252/ 292968 | consumed samples: 516096 | consumed tokens: 34783232 | elapsed time per iteration (ms): 108650.4 | learning rate: 1.376E-05 | global batch size: 2048 | lm loss: 6.411553E+00 | loss scale: 4096.0 | grad norm: 8276.113 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 253/ 292968 | consumed samples: 518144 | consumed tokens: 34930688 | elapsed time per iteration (ms): 109729.3 | learning rate: 1.382E-05 | global batch size: 2048 | lm loss: 6.445526E+00 | loss scale: 4096.0 | grad norm: 20950.135 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 254/ 292968 | consumed samples: 520192 | consumed tokens: 35078144 | elapsed time per iteration (ms): 107887.4 | learning rate: 1.387E-05 | global batch size: 2048 | lm loss: 6.465522E+00 | loss scale: 4096.0 | grad norm: 12417.724 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 255/ 292968 | consumed samples: 522240 | consumed tokens: 35225600 | elapsed time per iteration (ms): 108264.6 | learning rate: 1.393E-05 | global batch size: 2048 | lm loss: 6.435391E+00 | loss scale: 4096.0 | grad norm: 9464.387 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 256/ 292968 | consumed samples: 524288 | consumed tokens: 35373056 | elapsed time per iteration (ms): 107957.4 | learning rate: 1.398E-05 | global batch size: 2048 | lm loss: 6.436907E+00 | loss scale: 4096.0 | grad norm: 8957.010 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 257/ 292968 | consumed samples: 526336 | consumed tokens: 35520512 | elapsed time per iteration (ms): 109517.5 | learning rate: 1.404E-05 | global batch size: 2048 | lm loss: 6.413041E+00 | loss scale: 4096.0 | grad norm: 11170.481 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 258/ 292968 | consumed samples: 528384 | consumed tokens: 35667968 | elapsed time per iteration (ms): 109479.3 | learning rate: 1.409E-05 | global batch size: 2048 | lm loss: 6.400558E+00 | loss scale: 4096.0 | grad norm: 10956.268 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 259/ 292968 | consumed samples: 530432 | consumed tokens: 35815424 | elapsed time per iteration (ms): 108905.5 | learning rate: 1.414E-05 | global batch size: 2048 | lm loss: 6.422109E+00 | loss scale: 4096.0 | grad norm: 8642.350 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 260/ 292968 | consumed samples: 532480 | consumed tokens: 35962880 | elapsed time per iteration (ms): 107495.9 | learning rate: 1.420E-05 | global batch size: 2048 | lm loss: 6.398808E+00 | loss scale: 4096.0 | grad norm: 10964.468 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 261/ 292968 | consumed samples: 534528 | consumed tokens: 36110336 | elapsed time per iteration (ms): 108634.4 | learning rate: 1.425E-05 | global batch size: 2048 | lm loss: 6.388765E+00 | loss scale: 4096.0 | grad norm: 11237.345 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 262/ 292968 | consumed samples: 536576 | consumed tokens: 36257792 | elapsed time per iteration (ms): 108579.7 | learning rate: 1.431E-05 | global batch size: 2048 | lm loss: 6.385891E+00 | loss scale: 4096.0 | grad norm: 10603.256 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 263/ 292968 | consumed samples: 538624 | consumed tokens: 36405248 | elapsed time per iteration (ms): 110073.0 | learning rate: 1.436E-05 | global batch size: 2048 | lm loss: 6.399619E+00 | loss scale: 4096.0 | grad norm: 8039.348 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 264/ 292968 | consumed samples: 540672 | consumed tokens: 36552704 | elapsed time per iteration (ms): 109166.7 | learning rate: 1.442E-05 | global batch size: 2048 | lm loss: 6.395229E+00 | loss scale: 4096.0 | grad norm: 10842.676 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 265/ 292968 | consumed samples: 542720 | consumed tokens: 36700160 | elapsed time per iteration (ms): 108083.3 | learning rate: 1.447E-05 | global batch size: 2048 | lm loss: 6.383315E+00 | loss scale: 4096.0 | grad norm: 11138.567 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 266/ 292968 | consumed samples: 544768 | consumed tokens: 36847616 | elapsed time per iteration (ms): 110890.5 | learning rate: 1.453E-05 | global batch size: 2048 | lm loss: 6.366701E+00 | loss scale: 4096.0 | grad norm: 8608.717 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 267/ 292968 | consumed samples: 546816 | consumed tokens: 36995072 | elapsed time per iteration (ms): 109704.9 | learning rate: 1.458E-05 | global batch size: 2048 | lm loss: 6.374611E+00 | loss scale: 4096.0 | grad norm: 15404.370 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 268/ 292968 | consumed samples: 548864 | consumed tokens: 37142528 | elapsed time per iteration (ms): 121388.7 | learning rate: 1.464E-05 | global batch size: 2048 | lm loss: 6.387739E+00 | loss scale: 4096.0 | grad norm: 10116.191 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 269/ 292968 | consumed samples: 550912 | consumed tokens: 37289984 | elapsed time per iteration (ms): 110967.4 | learning rate: 1.469E-05 | global batch size: 2048 | lm loss: 6.363533E+00 | loss scale: 4096.0 | grad norm: 9367.284 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 270/ 292968 | consumed samples: 552960 | consumed tokens: 37437440 | elapsed time per iteration (ms): 115082.1 | learning rate: 1.475E-05 | global batch size: 2048 | lm loss: 6.338829E+00 | loss scale: 4096.0 | grad norm: 7684.800 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 271/ 292968 | consumed samples: 555008 | consumed tokens: 37584896 | elapsed time per iteration (ms): 115549.9 | learning rate: 1.480E-05 | global batch size: 2048 | lm loss: 6.348468E+00 | loss scale: 4096.0 | grad norm: 11673.500 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 272/ 292968 | consumed samples: 557056 | consumed tokens: 37732352 | elapsed time per iteration (ms): 111654.7 | learning rate: 1.485E-05 | global batch size: 2048 | lm loss: 6.331059E+00 | loss scale: 4096.0 | grad norm: 8199.100 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 273/ 292968 | consumed samples: 559104 | consumed tokens: 37879808 | elapsed time per iteration (ms): 109780.8 | learning rate: 1.491E-05 | global batch size: 2048 | lm loss: 6.350784E+00 | loss scale: 4096.0 | grad norm: 9073.286 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 274/ 292968 | consumed samples: 561152 | consumed tokens: 38027264 | elapsed time per iteration (ms): 109479.4 | learning rate: 1.496E-05 | global batch size: 2048 | lm loss: 6.319507E+00 | loss scale: 4096.0 | grad norm: 8731.338 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 275/ 292968 | consumed samples: 563200 | consumed tokens: 38174720 | elapsed time per iteration (ms): 108967.3 | learning rate: 1.502E-05 | global batch size: 2048 | lm loss: 6.315341E+00 | loss scale: 4096.0 | grad norm: 6636.142 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 276/ 292968 | consumed samples: 565248 | consumed tokens: 38322176 | elapsed time per iteration (ms): 108995.0 | learning rate: 1.507E-05 | global batch size: 2048 | lm loss: 6.329383E+00 | loss scale: 4096.0 | grad norm: 12850.433 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 277/ 292968 | consumed samples: 567296 | consumed tokens: 38469632 | elapsed time per iteration (ms): 109319.9 | learning rate: 1.513E-05 | global batch size: 2048 | lm loss: 6.327714E+00 | loss scale: 4096.0 | grad norm: 8193.709 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 278/ 292968 | consumed samples: 569344 | consumed tokens: 38617088 | elapsed time per iteration (ms): 108694.9 | learning rate: 1.518E-05 | global batch size: 2048 | lm loss: 6.327637E+00 | loss scale: 4096.0 | grad norm: 10361.149 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 279/ 292968 | consumed samples: 571392 | consumed tokens: 38764544 | elapsed time per iteration (ms): 110270.3 | learning rate: 1.524E-05 | global batch size: 2048 | lm loss: 6.325108E+00 | loss scale: 4096.0 | grad norm: 7427.475 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 280/ 292968 | consumed samples: 573440 | consumed tokens: 38912000 | elapsed time per
iteration (ms): 108974.6 | learning rate: 1.529E-05 | global batch size: 2048 | lm loss: 6.330306E+00 | loss scale: 4096.0 | grad norm: 12621.294 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 281/ 292968 | consumed samples: 575488 | consumed tokens: 39059456 | elapsed time per iteration (ms): 110050.7 | learning rate: 1.535E-05 | global batch size: 2048 | lm loss: 6.316774E+00 | loss scale: 4096.0 | grad norm: 8772.798 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 282/ 292968 | consumed samples: 577536 | consumed tokens: 39206912 | elapsed time per iteration (ms): 109956.6 | learning rate: 1.540E-05 | global batch size: 2048 | lm loss: 6.313440E+00 | loss scale: 4096.0 | grad norm: 9058.110 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 283/ 292968 | consumed samples: 579584 | consumed tokens: 39354368 | elapsed time per iteration (ms): 109511.7 | learning rate: 1.546E-05 | global batch size: 2048 | lm loss: 6.306503E+00 | loss scale: 4096.0 | grad norm: 12318.138 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 284/ 292968 | consumed samples: 581632 | consumed tokens: 39501824 | elapsed time per iteration (ms): 109573.9 | learning rate: 1.551E-05 | global batch size: 2048 | lm loss: 6.323323E+00 | loss scale: 4096.0 | grad norm: 11230.151 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 285/ 292968 | consumed samples: 583680 | consumed tokens: 39649280 | elapsed time per iteration (ms): 109101.4 | learning rate: 1.556E-05 | global batch size: 2048 | lm loss: 6.304620E+00 | loss scale: 4096.0 | grad norm: 7445.564 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 286/ 292968 | consumed samples: 585728 | consumed tokens: 39796736 | elapsed time per iteration (ms): 108795.7 | learning rate: 1.562E-05 | global batch size: 2048 | lm loss: 6.321280E+00 | loss scale: 4096.0 | grad norm: 13547.480 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 287/ 292968 | consumed samples: 587776 | consumed tokens: 39944192 | elapsed time per iteration (ms): 108637.8 | learning rate: 1.567E-05 | global batch size: 2048 | lm loss: 6.304349E+00 | loss scale: 4096.0 | grad norm: 11384.947 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 288/ 292968 | consumed samples: 589824 | consumed tokens: 40091648 | elapsed time per iteration (ms): 108691.2 | learning rate: 1.573E-05 | global batch size: 2048 | lm loss: 6.283967E+00 | loss scale: 4096.0 | grad norm: 8260.212 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 289/ 292968 | consumed samples: 591872 | consumed tokens: 40239104 | elapsed time per iteration (ms): 109618.4 | learning rate: 1.578E-05 | global batch size: 2048 | lm loss: 6.322189E+00 | loss scale: 4096.0 | grad norm: 10440.905 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 290/ 292968 | consumed 
samples: 593920 | consumed tokens: 40386560 | elapsed time per iteration (ms): 108057.1 | learning rate: 1.584E-05 | global batch size: 2048 | lm loss: 6.298853E+00 | loss scale: 4096.0 | grad norm: 11900.913 | num zeros: 0.0 | curriculum seqlen: 72 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 291/ 292968 | consumed samples: 595968 | consumed tokens: 40550400 | elapsed time per iteration (ms): 108750.9 | learning rate: 1.589E-05 | global batch size: 2048 | lm loss: 6.305848E+00 | loss scale: 4096.0 | grad norm: 9601.247 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 292/ 292968 | consumed samples: 598016 | consumed tokens: 40714240 | elapsed time per iteration (ms): 110191.1 | learning rate: 1.595E-05 | global batch size: 2048 | lm loss: 6.315869E+00 | loss scale: 4096.0 | grad norm: 13008.336 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 293/ 292968 | consumed samples: 600064 | consumed tokens: 40878080 | elapsed time per iteration (ms): 111844.3 | learning rate: 1.600E-05 | global batch size: 2048 | lm loss: 6.328422E+00 | loss scale: 4096.0 | grad norm: 11396.638 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 294/ 292968 | consumed samples: 602112 | consumed tokens: 41041920 | elapsed time per iteration (ms): 108641.9 | learning rate: 1.606E-05 | global batch size: 2048 | lm loss: 6.324135E+00 | loss scale: 4096.0 | grad norm: 8693.609 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 295/ 292968 | consumed samples: 604160 | consumed tokens: 41205760 | elapsed time per iteration (ms): 109558.0 | learning rate: 1.611E-05 | global batch size: 2048 | lm loss: 6.297732E+00 | loss scale: 4096.0 | grad norm: 13696.182 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 296/ 292968 | consumed samples: 606208 | consumed tokens: 41369600 | elapsed time per iteration (ms): 109749.6 | learning rate: 1.617E-05 | global batch size: 2048 | lm loss: 6.281199E+00 | loss scale: 4096.0 | grad norm: 8949.922 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 297/ 292968 | consumed samples: 608256 | consumed tokens: 41533440 | elapsed time per iteration (ms): 111113.5 | learning rate: 1.622E-05 | global batch size: 2048 | lm loss: 6.274428E+00 | loss scale: 4096.0 | grad norm: 10521.629 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 298/ 292968 | consumed samples: 610304 | consumed tokens: 41697280 | elapsed time per iteration (ms): 109095.9 | learning rate: 1.627E-05 | global batch size: 2048 | lm loss: 6.271103E+00 | loss scale: 4096.0 | grad norm: 11913.828 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 299/ 292968 | consumed samples: 612352 | consumed tokens: 41861120 | elapsed time per iteration (ms): 111229.8 | learning rate: 1.633E-05 | global batch size: 2048 | lm loss: 6.241245E+00 | loss scale: 4096.0 | grad norm: 9488.586 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 300/ 292968 | consumed samples: 614400 | consumed tokens: 42024960 | elapsed time per iteration (ms): 110389.0 | learning rate: 1.638E-05 | global batch size: 2048 | lm loss: 6.266589E+00 | loss scale: 4096.0 | grad norm: 8618.853 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) ------------------------------------------------------------------------------------------------ - validation loss at iteration 300 | lm loss value: 6.226129E+00 | lm loss PPL: 5.057938E+02 | ------------------------------------------------------------------------------------------------ -saving checkpoint at iteration 300 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-24 20:51:13,985] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/mp_rank_01_model_states.pt -[2021-10-24 20:51:14,138] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/mp_rank_00_model_states.pt -[2021-10-24 20:51:26,925] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-24 20:51:26,956] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-24 20:51:26,995] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_107_optim_states.pt -[2021-10-24 20:51:27,018] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-24 20:51:27,102] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-24 20:51:27,102] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-24 20:51:27,105] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-24 20:51:27,107] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-24 20:51:27,126] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-24 20:51:27,144] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-24 20:51:27,158] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_92_optim_states.pt 
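The validation block above prints both the raw lm loss and its perplexity; the two values are consistent with PPL = exp(loss), the exponential of the mean natural-log cross-entropy. A minimal check of that relationship (plain Python; illustrative only, not part of the training code):

    import math

    lm_loss = 6.226129            # validation lm loss at iteration 300, from the log above
    ppl = math.exp(lm_loss)       # perplexity = e**loss for a natural-log cross-entropy
    print(f"lm loss PPL: {ppl:.6E}")   # -> 5.057938E+02, matching the printed value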
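The consumed-samples and consumed-tokens counters in the iteration lines above also follow directly from the batch and curriculum settings: samples advance by the global batch size (2048) per iteration, and tokens advance by 2048 x curriculum seqlen, which is why the per-iteration token increment steps from 147,456 to 163,840 when the curriculum raises seqlen from 72 to 80 at iteration 291. A small sanity check of that bookkeeping (illustrative Python, not Megatron-DeepSpeed source):

    GLOBAL_BATCH_SIZE = 2048

    # Token increment per iteration is global batch size x curriculum seqlen.
    assert GLOBAL_BATCH_SIZE * 72 == 147456   # increment while curriculum seqlen is 72
    assert GLOBAL_BATCH_SIZE * 80 == 163840   # increment once curriculum seqlen is 80

    # Iteration 290 -> 291, where the curriculum steps from 72 to 80:
    print(40386560 + GLOBAL_BATCH_SIZE * 80)  # 40550400, the iteration-291 token counter
    print(GLOBAL_BATCH_SIZE * 291)            # 595968, the iteration-291 sample counter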
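Since every iteration line above shares one pipe-delimited format, the loss and grad-norm series are easy to extract programmatically. A sketch of a parser for these lines (hypothetical helper, assuming only the field layout visible in this log):

    import re

    # Matches the iteration lines printed above and captures three of the fields.
    PATTERN = re.compile(
        r"iteration\s+(\d+)/\s*\d+ \|.*?lm loss: ([\d.E+-]+) \|.*?grad norm: ([\d.]+)"
    )

    def parse_iteration_line(line):
        """Return (iteration, lm_loss, grad_norm), or None if the line doesn't match."""
        m = PATTERN.search(line)
        if m is None:
            return None
        step, loss, gnorm = m.groups()
        return int(step), float(loss), float(gnorm)

    sample = ("iteration 300/ 292968 | consumed samples: 614400 | "
              "lm loss: 6.266589E+00 | loss scale: 4096.0 | grad norm: 8618.853 |")
    print(parse_iteration_line(sample))   # (300, 6.266589, 8618.853)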
-[2021-10-24 20:51:27,170] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_15_optim_states.pt
[... roughly 110 similar "zero checkpoint saved" lines, one per remaining mp_rank_* optimizer shard (timestamps 20:51:27 through 20:51:36), omitted ...]
-[2021-10-24 20:51:36,608] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-24 20:51:36,806] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-24 20:51:36,884] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-24 20:51:41,493] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-24 20:51:43,192] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt - successfully saved checkpoint at iteration 300 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 32193.02 - iteration 301/ 292968 | consumed samples: 616448 | consumed tokens: 42188800 | elapsed time per iteration (ms): 350290.1 | learning rate: 1.644E-05 | global batch size: 2048 | lm loss: 6.248915E+00 | loss scale: 4096.0 | grad norm: 9251.881 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 302/ 292968 | consumed samples: 618496 | consumed tokens: 42352640 | elapsed time per iteration (ms): 110979.8 | learning rate: 1.649E-05 | global batch size: 2048 | lm loss: 6.233592E+00 | loss scale: 4096.0 | grad norm: 9261.127 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 303/ 292968 | consumed samples: 620544 | consumed tokens: 42516480 | elapsed time per iteration (ms): 109402.0 | learning rate: 1.655E-05 | global batch size: 2048 | lm loss: 6.235322E+00 | loss scale: 4096.0 | grad norm: 8259.876 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 304/ 292968 | consumed samples: 622592 | consumed tokens: 42680320 | elapsed time per iteration (ms): 110441.2 | learning rate: 1.660E-05 | global batch size: 2048 | lm loss: 6.242181E+00 | loss scale: 4096.0 | grad norm: 8215.770 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 305/ 292968 | consumed samples: 624640 | consumed tokens: 42844160 | elapsed time per iteration (ms): 109060.1 | learning rate: 1.666E-05 | global batch size: 2048 | lm loss: 6.228780E+00 | loss scale: 4096.0 | grad norm: 10114.298 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 306/ 292968 | consumed samples: 626688 | consumed tokens: 43008000 | elapsed time per iteration (ms): 110875.5 | learning rate: 1.671E-05 | global batch size: 2048 | lm loss: 6.244180E+00 | loss scale: 4096.0 | grad norm: 7806.418 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 307/ 292968 | consumed samples: 628736 | consumed tokens: 43171840 | elapsed time per iteration (ms): 110940.1 | learning rate: 1.677E-05 | global batch size: 2048 | lm loss: 6.251504E+00 | loss scale: 4096.0 | grad norm: 12245.133 | num 
zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 308/ 292968 | consumed samples: 630784 | consumed tokens: 43335680 | elapsed time per iteration (ms): 110572.7 | learning rate: 1.682E-05 | global batch size: 2048 | lm loss: 6.242295E+00 | loss scale: 4096.0 | grad norm: 8985.877 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 309/ 292968 | consumed samples: 632832 | consumed tokens: 43499520 | elapsed time per iteration (ms): 111400.3 | learning rate: 1.688E-05 | global batch size: 2048 | lm loss: 6.245388E+00 | loss scale: 4096.0 | grad norm: 9628.991 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 310/ 292968 | consumed samples: 634880 | consumed tokens: 43663360 | elapsed time per iteration (ms): 110470.0 | learning rate: 1.693E-05 | global batch size: 2048 | lm loss: 6.245456E+00 | loss scale: 4096.0 | grad norm: 10937.524 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 311/ 292968 | consumed samples: 636928 | consumed tokens: 43827200 | elapsed time per iteration (ms): 109105.3 | learning rate: 1.698E-05 | global batch size: 2048 | lm loss: 6.228984E+00 | loss scale: 4096.0 | grad norm: 13789.568 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 312/ 292968 | consumed samples: 638976 | consumed tokens: 43991040 | elapsed time per iteration (ms): 109819.2 | learning rate: 1.704E-05 | global batch size: 2048 | lm loss: 6.235544E+00 | loss scale: 4096.0 | grad norm: 9352.335 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 313/ 292968 | consumed samples: 641024 | consumed tokens: 44154880 | elapsed time per iteration (ms): 109518.0 | learning rate: 1.709E-05 | global batch size: 2048 | lm loss: 6.215362E+00 | loss scale: 4096.0 | grad norm: 9782.494 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 314/ 292968 | consumed samples: 643072 | consumed tokens: 44318720 | elapsed time per iteration (ms): 110565.4 | learning rate: 1.715E-05 | global batch size: 2048 | lm loss: 6.213126E+00 | loss scale: 4096.0 | grad norm: 11655.961 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 315/ 292968 | consumed samples: 645120 | consumed tokens: 44482560 | elapsed time per iteration (ms): 109404.8 | learning rate: 1.720E-05 | global batch size: 2048 | lm loss: 6.243786E+00 | loss scale: 4096.0 | grad norm: 10283.912 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 316/ 292968 | consumed samples: 647168 | consumed tokens: 44646400 | elapsed time per iteration (ms): 110479.7 | learning rate: 1.726E-05 | global batch size: 2048 | lm loss: 6.213628E+00 | loss scale: 4096.0 | grad norm: 8441.775 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 317/ 292968 | consumed samples: 649216 | consumed tokens: 44810240 | elapsed time per iteration (ms): 111950.1 | learning rate: 1.731E-05 | global batch size: 2048 | lm loss: 
6.200946E+00 | loss scale: 4096.0 | grad norm: 13379.365 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 318/ 292968 | consumed samples: 651264 | consumed tokens: 44974080 | elapsed time per iteration (ms): 111272.2 | learning rate: 1.737E-05 | global batch size: 2048 | lm loss: 6.183933E+00 | loss scale: 4096.0 | grad norm: 8300.364 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 319/ 292968 | consumed samples: 653312 | consumed tokens: 45137920 | elapsed time per iteration (ms): 109729.5 | learning rate: 1.742E-05 | global batch size: 2048 | lm loss: 6.229595E+00 | loss scale: 4096.0 | grad norm: 16879.992 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 320/ 292968 | consumed samples: 655360 | consumed tokens: 45301760 | elapsed time per iteration (ms): 110601.2 | learning rate: 1.748E-05 | global batch size: 2048 | lm loss: 6.231015E+00 | loss scale: 4096.0 | grad norm: 10879.370 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 321/ 292968 | consumed samples: 657408 | consumed tokens: 45465600 | elapsed time per iteration (ms): 110569.3 | learning rate: 1.753E-05 | global batch size: 2048 | lm loss: 6.161396E+00 | loss scale: 4096.0 | grad norm: 8570.948 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 322/ 292968 | consumed samples: 659456 | consumed tokens: 45629440 | elapsed time per iteration (ms): 108954.0 | learning rate: 1.759E-05 | global batch size: 2048 | lm loss: 6.178751E+00 | loss scale: 4096.0 | grad norm: 10012.610 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 323/ 292968 | consumed samples: 661504 | consumed tokens: 45793280 | elapsed time per iteration (ms): 111976.8 | learning rate: 1.764E-05 | global batch size: 2048 | lm loss: 6.168045E+00 | loss scale: 4096.0 | grad norm: 10580.266 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 324/ 292968 | consumed samples: 663552 | consumed tokens: 45957120 | elapsed time per iteration (ms): 109721.0 | learning rate: 1.769E-05 | global batch size: 2048 | lm loss: 6.178845E+00 | loss scale: 4096.0 | grad norm: 10402.177 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 325/ 292968 | consumed samples: 665600 | consumed tokens: 46120960 | elapsed time per iteration (ms): 111305.3 | learning rate: 1.775E-05 | global batch size: 2048 | lm loss: 6.191531E+00 | loss scale: 4096.0 | grad norm: 6659.238 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 326/ 292968 | consumed samples: 667648 | consumed tokens: 46284800 | elapsed time per iteration (ms): 111054.8 | learning rate: 1.780E-05 | global batch size: 2048 | lm loss: 6.219053E+00 | loss scale: 4096.0 | grad norm: 23331.838 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 327/ 292968 | consumed samples: 669696 | consumed tokens: 46448640 | elapsed time per iteration (ms): 109631.0 | 
learning rate: 1.786E-05 | global batch size: 2048 | lm loss: 6.238684E+00 | loss scale: 4096.0 | grad norm: 10272.825 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 328/ 292968 | consumed samples: 671744 | consumed tokens: 46612480 | elapsed time per iteration (ms): 111035.5 | learning rate: 1.791E-05 | global batch size: 2048 | lm loss: 6.232896E+00 | loss scale: 4096.0 | grad norm: 14860.284 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 329/ 292968 | consumed samples: 673792 | consumed tokens: 46776320 | elapsed time per iteration (ms): 109214.3 | learning rate: 1.797E-05 | global batch size: 2048 | lm loss: 6.186585E+00 | loss scale: 4096.0 | grad norm: 10239.948 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 330/ 292968 | consumed samples: 675840 | consumed tokens: 46940160 | elapsed time per iteration (ms): 111084.1 | learning rate: 1.802E-05 | global batch size: 2048 | lm loss: 6.195550E+00 | loss scale: 4096.0 | grad norm: 8588.792 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 331/ 292968 | consumed samples: 677888 | consumed tokens: 47104000 | elapsed time per iteration (ms): 110797.2 | learning rate: 1.808E-05 | global batch size: 2048 | lm loss: 6.159820E+00 | loss scale: 4096.0 | grad norm: 9632.370 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 332/ 292968 | consumed samples: 679936 | consumed tokens: 47267840 | elapsed time per iteration (ms): 111874.0 | learning rate: 1.813E-05 | global batch size: 2048 | lm loss: 6.194593E+00 | loss scale: 4096.0 | grad norm: 13527.706 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 333/ 292968 | consumed samples: 681984 | consumed tokens: 47431680 | elapsed time per iteration (ms): 109490.3 | learning rate: 1.819E-05 | global batch size: 2048 | lm loss: 6.183351E+00 | loss scale: 4096.0 | grad norm: 8889.699 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 334/ 292968 | consumed samples: 684032 | consumed tokens: 47595520 | elapsed time per iteration (ms): 110927.6 | learning rate: 1.824E-05 | global batch size: 2048 | lm loss: 6.207039E+00 | loss scale: 4096.0 | grad norm: 13804.996 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 335/ 292968 | consumed samples: 686080 | consumed tokens: 47759360 | elapsed time per iteration (ms): 110163.4 | learning rate: 1.830E-05 | global batch size: 2048 | lm loss: 6.144939E+00 | loss scale: 4096.0 | grad norm: 8306.471 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 336/ 292968 | consumed samples: 688128 | consumed tokens: 47923200 | elapsed time per iteration (ms): 110221.6 | learning rate: 1.835E-05 | global batch size: 2048 | lm loss: 6.182420E+00 | loss scale: 4096.0 | grad norm: 8945.397 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 337/ 292968 | consumed samples: 690176 | consumed tokens: 48087040 | elapsed time per iteration (ms): 110563.2 | learning rate: 1.840E-05 | global batch size: 2048 | lm loss: 6.174747E+00 | loss scale: 4096.0 | grad norm: 9887.871 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 338/ 292968 | consumed samples: 692224 | consumed tokens: 48250880 | elapsed time per iteration (ms): 111009.0 | learning rate: 1.846E-05 | global batch size: 2048 | lm loss: 6.158761E+00 | loss scale: 4096.0 | grad norm: 9667.951 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 339/ 292968 | consumed samples: 694272 | consumed tokens: 48414720 | elapsed time per iteration (ms): 111177.9 | learning rate: 1.851E-05 | global batch size: 2048 | lm loss: 6.179541E+00 | loss scale: 4096.0 | grad norm: 7917.093 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 340/ 292968 | consumed samples: 696320 | consumed tokens: 48578560 | elapsed time per iteration (ms): 110359.6 | learning rate: 1.857E-05 | global batch size: 2048 | lm loss: 6.146617E+00 | loss scale: 4096.0 | grad norm: 8861.306 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 341/ 292968 | consumed samples: 698368 | consumed tokens: 48742400 | elapsed time per iteration (ms): 112066.6 | learning rate: 1.862E-05 | global batch size: 2048 | lm loss: 6.174376E+00 | loss scale: 4096.0 | grad norm: 10658.177 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 342/ 292968 | consumed samples: 700416 | consumed tokens: 48906240 | elapsed time per iteration (ms): 110247.5 | learning rate: 1.868E-05 | global batch size: 2048 | lm loss: 6.146154E+00 | loss scale: 4096.0 | grad norm: 6865.284 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 343/ 292968 | consumed samples: 702464 | consumed tokens: 49070080 | elapsed time per iteration (ms): 111574.7 | learning rate: 1.873E-05 | global batch size: 2048 | lm loss: 6.137790E+00 | loss scale: 4096.0 | grad norm: 12570.156 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 344/ 292968 | consumed samples: 704512 | consumed tokens: 49233920 | elapsed time per iteration (ms): 110106.9 | learning rate: 1.879E-05 | global batch size: 2048 | lm loss: 6.143319E+00 | loss scale: 4096.0 | grad norm: 9560.909 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 345/ 292968 | consumed samples: 706560 | consumed tokens: 49397760 | elapsed time per iteration (ms): 111575.9 | learning rate: 1.884E-05 | global batch size: 2048 | lm loss: 6.115140E+00 | loss scale: 4096.0 | grad norm: 6673.672 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 346/ 292968 | consumed samples: 708608 | consumed tokens: 49561600 | elapsed time per iteration (ms): 112722.7 | learning rate: 1.890E-05 | global batch size: 2048 | lm loss: 6.140611E+00 | loss scale: 4096.0 | grad norm: 9006.598 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 347/ 292968 | consumed samples: 710656 | consumed tokens: 49725440 | elapsed time per iteration (ms): 112166.0 | learning rate: 1.895E-05 | global batch size: 2048 | lm loss: 6.130188E+00 | loss scale: 4096.0 | grad norm: 10153.983 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 348/ 292968 | consumed samples: 712704 | consumed tokens: 49889280 | elapsed time per iteration (ms): 110618.2 | learning rate: 1.901E-05 | global batch size: 2048 | lm loss: 6.143208E+00 | loss scale: 4096.0 | grad norm: 9577.347 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 349/ 292968 | consumed samples: 714752 | consumed tokens: 50053120 | elapsed time per iteration (ms): 111403.6 | learning rate: 1.906E-05 | global batch size: 2048 | lm loss: 6.058227E+00 | loss scale: 4096.0 | grad norm: 8421.473 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 350/ 292968 | consumed samples: 716800 | consumed tokens: 50216960 | elapsed time per iteration (ms): 110771.6 | learning rate: 1.911E-05 | global batch size: 2048 | lm loss: 6.112644E+00 | loss scale: 4096.0 | grad norm: 9199.218 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 351/ 292968 | consumed samples: 718848 | consumed tokens: 50380800 | elapsed time per iteration (ms): 111351.7 | learning rate: 1.917E-05 | global batch size: 2048 | lm loss: 6.089656E+00 | loss scale: 4096.0 | grad norm: 9349.499 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 352/ 292968 | consumed samples: 720896 | consumed tokens: 50544640 | elapsed time per iteration (ms): 109546.9 | learning rate: 1.922E-05 | global batch size: 2048 | lm loss: 6.054612E+00 | loss scale: 4096.0 | grad norm: 4868.792 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 353/ 292968 | consumed samples: 722944 | consumed tokens: 50708480 | elapsed time per iteration (ms): 111183.3 | learning rate: 1.928E-05 | global batch size: 2048 | lm loss: 6.129261E+00 | loss scale: 4096.0 | grad norm: 11432.620 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 354/ 292968 | consumed samples: 724992 | consumed tokens: 50872320 | elapsed time per iteration (ms): 110597.0 | learning rate: 1.933E-05 | global batch size: 2048 | lm loss: 6.092914E+00 | loss scale: 4096.0 | grad norm: 6716.544 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 355/ 292968 | consumed samples: 727040 | consumed tokens: 51036160 | elapsed time per iteration (ms): 110532.8 | learning rate: 1.939E-05 | global batch size: 2048 | lm loss: 6.119990E+00 | loss scale: 4096.0 | grad norm: 9670.629 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 356/ 292968 | consumed samples: 729088 | consumed tokens: 51200000 | elapsed time per iteration (ms): 112688.6 | learning rate: 1.944E-05 | global batch size: 2048 | lm loss: 6.099743E+00 | loss scale: 4096.0 | grad norm: 7866.218 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 357/ 292968 | consumed samples: 731136 | consumed tokens: 51363840 | elapsed time per iteration (ms): 110315.3 | learning rate: 1.950E-05 | global batch size: 2048 | lm loss: 6.068275E+00 | loss scale: 4096.0 | grad norm: 8774.940 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 358/ 292968 | consumed samples: 733184 | consumed tokens: 51527680 | elapsed time per iteration (ms): 112965.8 | learning rate: 1.955E-05 | global batch size: 2048 | lm loss: 6.096206E+00 | loss scale: 4096.0 | grad norm: 7280.418 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 359/ 292968 | consumed samples: 735232 | consumed tokens: 51691520 | elapsed time per iteration (ms): 109588.3 | learning rate: 1.961E-05 | global batch size: 2048 | lm loss: 6.114758E+00 | loss scale: 4096.0 | grad norm: 8412.337 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 360/ 292968 | consumed samples: 737280 | consumed tokens: 51855360 | elapsed time per iteration (ms): 111638.6 | learning rate: 1.966E-05 | global batch size: 2048 | lm loss: 6.104151E+00 | loss scale: 4096.0 | grad norm: 6553.513 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 361/ 292968 | consumed samples: 739328 | consumed tokens: 52019200 | elapsed time per iteration (ms): 111314.5 | learning rate: 1.972E-05 | global batch size: 2048 | lm loss: 6.076555E+00 | loss scale: 4096.0 | grad norm: 8810.296 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 362/ 292968 | consumed samples: 741376 | consumed tokens: 52183040 | elapsed time per iteration (ms): 110736.0 | learning rate: 1.977E-05 | global batch size: 2048 | lm loss: 6.063091E+00 | loss scale: 4096.0 | grad norm: 9564.015 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 363/ 292968 | consumed samples: 743424 | consumed tokens: 52346880 | elapsed time per iteration (ms): 110896.1 | learning rate: 1.982E-05 | global batch size: 2048 | lm loss: 6.067285E+00 | loss scale: 4096.0 | grad norm: 8732.418 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 364/ 292968 | consumed samples: 745472 | consumed tokens: 52510720 | elapsed time per iteration (ms): 110464.3 | learning rate: 1.988E-05 | global batch size: 2048 | lm loss: 6.045290E+00 | loss scale: 4096.0 | grad norm: 7911.537 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 365/ 292968 | consumed samples: 747520 | consumed tokens: 52674560 | elapsed time per iteration (ms): 111178.2 | learning rate: 1.993E-05 | global batch size: 2048 | lm loss: 6.032138E+00 | loss scale: 4096.0 | grad norm: 11692.026 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 366/ 292968 | consumed samples: 749568 | consumed tokens: 52838400 | elapsed time per iteration (ms): 110599.3 | learning rate: 1.999E-05 | global batch size: 2048 | lm loss: 6.062290E+00 | loss scale: 4096.0 | grad norm: 8750.678 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 367/ 292968 | consumed samples: 751616 | consumed tokens: 53002240 | elapsed time per iteration (ms): 109835.9 | learning rate: 2.004E-05 | global batch size: 2048 | lm loss: 6.090322E+00 | loss scale: 4096.0 | grad norm: 10644.854 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 368/ 292968 | consumed samples: 753664 | consumed tokens: 53166080 | elapsed time per iteration (ms): 110521.3 | learning rate: 2.010E-05 | global batch size: 2048 | lm loss: 6.074631E+00 | loss scale: 4096.0 | grad norm: 9220.344 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 369/ 292968 | consumed samples: 755712 | consumed tokens: 53329920 | elapsed time per iteration (ms): 111918.6 | learning rate: 2.015E-05 | global batch size: 2048 | lm loss: 6.053720E+00 | loss scale: 4096.0 | grad norm: 8940.859 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 370/ 292968 | consumed samples: 757760 | consumed tokens: 53493760 | elapsed time per iteration (ms): 110422.7 | learning rate: 2.021E-05 | global batch size: 2048 | lm loss: 6.049482E+00 | loss scale: 4096.0 | grad norm: 6966.516 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 371/ 292968 | consumed samples: 759808 | consumed tokens: 53657600 | elapsed time per iteration (ms): 111322.5 | learning rate: 2.026E-05 | global batch size: 2048 | lm loss: 6.030096E+00 | loss scale: 4096.0 | grad norm: 10472.816 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 372/ 292968 | consumed samples: 761856 | consumed tokens: 53821440 | elapsed time per iteration (ms): 110377.0 | learning rate: 2.032E-05 | global batch size: 2048 | lm loss: 6.065630E+00 | loss scale: 4096.0 | grad norm: 8343.691 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 373/ 292968 | consumed samples: 763904 | consumed tokens: 53985280 | elapsed time per iteration (ms): 109866.0 | learning rate: 2.037E-05 | global batch size: 2048 | lm loss: 6.073018E+00 | loss scale: 4096.0 | grad norm: 7894.417 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 374/ 292968 | consumed samples: 765952 | consumed tokens: 54149120 | elapsed time per iteration (ms): 112326.4 | learning rate: 2.043E-05 | global batch size: 2048 | lm loss: 6.047641E+00 | loss scale: 4096.0 | grad norm: 9539.723 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 375/ 292968 | consumed samples: 768000 | consumed tokens: 54312960 | elapsed time per iteration (ms): 109222.3 | learning rate: 2.048E-05 | global batch size: 2048 | lm loss: 6.017626E+00 | loss scale: 4096.0 | grad norm: 5641.349 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 376/ 292968 | consumed samples: 770048 | consumed tokens: 54476800 | elapsed time per iteration (ms): 111056.2 | learning rate: 2.053E-05 | global batch size: 2048 | lm loss: 6.041435E+00 | loss scale: 4096.0 | grad norm: 9676.166 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 377/ 292968 | consumed samples: 772096 | consumed tokens: 54640640 | elapsed time per iteration (ms): 110553.4 | learning rate: 2.059E-05 | global batch size: 2048 | lm loss: 6.022824E+00 | loss scale: 4096.0 | grad norm: 7117.206 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 378/ 292968 | consumed samples: 774144 | consumed tokens: 54804480 | elapsed time per iteration (ms): 112646.4 | learning rate: 2.064E-05 | global batch size: 2048 | lm loss: 6.018000E+00 | loss scale: 4096.0 | grad norm: 6769.929 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 379/ 292968 | consumed samples: 776192 | consumed tokens: 54968320 | elapsed time per iteration (ms): 110276.7 | learning rate: 2.070E-05 | global batch size: 2048 | lm loss: 6.019231E+00 | loss scale: 4096.0 | grad norm: 8731.896 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 380/ 292968 | consumed samples: 778240 | consumed tokens: 55132160 | elapsed time per iteration (ms): 111014.1 | learning rate: 2.075E-05 | global batch size: 2048 | lm loss: 6.022727E+00 | loss scale: 4096.0 | grad norm: 5855.788 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 381/ 292968 | consumed samples: 780288 | consumed tokens: 55296000 | elapsed time per iteration (ms): 109942.5 | learning rate: 2.081E-05 | global batch size: 2048 | lm loss: 6.015767E+00 | loss scale: 4096.0 | grad norm: 9438.092 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 382/ 292968 | consumed samples: 782336 | consumed tokens: 55459840 | elapsed time per iteration (ms): 112087.0 | learning rate: 2.086E-05 | global batch size: 2048 | lm loss: 6.003777E+00 | loss scale: 4096.0 | grad norm: 8323.425 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 383/ 292968 | consumed samples: 784384 | consumed tokens: 55623680 | elapsed time per iteration (ms): 111200.1 | learning rate: 2.092E-05 | global batch size: 2048 | lm loss: 6.008110E+00 | loss scale: 4096.0 | grad norm: 8577.739 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 384/ 292968 | consumed samples: 786432 | consumed tokens: 55787520 | elapsed time per iteration (ms): 111467.0 | learning rate: 2.097E-05 | global batch size: 2048 | lm loss: 6.044541E+00 | loss scale: 4096.0 | grad norm: 9773.609 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 385/ 292968 | consumed samples: 788480 | consumed tokens: 55951360 | elapsed time per iteration (ms): 110512.7 | learning rate: 2.103E-05 | global batch size: 2048 | lm loss: 6.002804E+00 | loss scale: 4096.0 | grad norm: 7430.040 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 386/ 292968 | consumed samples: 790528 | consumed tokens: 56115200 | elapsed time per iteration (ms): 110515.3 | learning rate: 2.108E-05 | global batch size: 2048 | lm loss: 6.008804E+00 | loss scale: 4096.0 | grad norm: 7985.891 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 387/ 292968 | consumed samples: 792576 | consumed tokens: 56279040 | elapsed time per iteration (ms): 109627.9 | learning rate: 2.114E-05 | global batch size: 2048 | lm loss: 5.993518E+00 | loss scale: 4096.0 | grad norm: 8976.041 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 388/ 292968 | consumed samples: 794624 | consumed tokens: 56442880 | elapsed time per iteration (ms): 111786.5 | learning rate: 2.119E-05 | global batch size: 2048 | lm loss: 5.981034E+00 | loss scale: 4096.0 | grad norm: 7076.540 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 389/ 292968 | consumed samples: 796672 | consumed tokens: 56606720 | elapsed time per iteration (ms): 110291.7 | learning rate: 2.124E-05 | global batch size: 2048 | lm loss: 5.990614E+00 | loss scale: 4096.0 | grad norm: 6554.702 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 390/ 292968 | consumed samples: 798720 | consumed tokens: 56770560 | elapsed time per iteration (ms): 111362.0 | learning rate: 2.130E-05 | global batch size: 2048 | lm loss: 5.982703E+00 | loss scale: 4096.0 | grad norm: 9555.875 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 391/ 292968 | consumed samples: 800768 | consumed tokens: 56934400 | elapsed time per iteration (ms): 111112.9 | learning rate: 2.135E-05 | global batch size: 2048 | lm loss: 5.961536E+00 | loss scale: 4096.0 | grad norm: 6745.755 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 392/ 292968 | consumed samples: 802816 | consumed tokens: 57098240 | elapsed time per iteration (ms): 111787.3 | learning rate: 2.141E-05 | global batch size: 2048 | lm loss: 5.970945E+00 | loss scale: 4096.0 | grad norm: 7857.538 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 393/ 292968 | consumed samples: 804864 | consumed tokens: 57262080 | elapsed time per iteration (ms): 111411.4 | learning rate: 2.146E-05 | global batch size: 2048 | lm loss: 5.962298E+00 | loss scale: 4096.0 | grad norm: 9574.464 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 394/ 292968 | consumed samples: 806912 | consumed tokens: 57425920 | elapsed time per iteration (ms): 111772.2 | learning rate: 2.152E-05 | global batch size: 2048 | lm loss: 5.989485E+00 | loss scale: 4096.0 | grad norm: 7933.256 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 395/ 292968 | consumed samples: 808960 | consumed tokens: 57589760 | elapsed time per iteration (ms): 110320.2 | learning rate: 2.157E-05 | global batch size: 2048 | lm loss: 5.965234E+00 | loss scale: 4096.0 | grad norm: 9428.165 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 396/ 292968 | consumed samples: 811008 | consumed tokens: 57753600 | elapsed time per iteration (ms): 110804.7 | learning rate: 2.163E-05 | global batch size: 2048 | lm loss: 5.937716E+00 | loss scale: 4096.0 | grad norm: 8460.811 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 397/ 292968 | consumed samples: 813056 | consumed tokens: 57917440 | elapsed time per iteration (ms): 111654.9 | learning rate: 2.168E-05 | global batch size: 2048 | lm loss: 5.942237E+00 | loss scale: 4096.0 | grad norm: 7390.073 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 398/ 292968 | consumed samples: 815104 | consumed tokens: 58081280 | elapsed time per iteration (ms): 110279.9 | learning rate: 2.174E-05 | global batch size: 2048 | lm loss: 5.927762E+00 | loss scale: 4096.0 | grad norm: 9312.831 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 399/ 292968 | consumed samples: 817152 | consumed tokens: 58245120 | elapsed time per iteration (ms): 111751.3 | learning rate: 2.179E-05 | global batch size: 2048 | lm loss: 5.935436E+00 | loss scale: 4096.0 | grad norm: 7319.939 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 400/ 292968 | consumed samples: 819200 | consumed tokens: 58408960 | elapsed time per iteration (ms): 109502.4 | learning rate: 2.185E-05 | global batch size: 2048 | lm loss: 5.967855E+00 | loss scale: 4096.0 | grad norm: 6157.040 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 401/ 292968 | consumed samples: 821248 | consumed tokens: 58572800 | elapsed time per iteration (ms): 111228.7 | learning rate: 2.190E-05 | global batch size: 2048 | lm loss: 5.958130E+00 | loss scale: 4096.0 | grad norm: 10193.693 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 402/ 292968 | consumed samples: 823296 | consumed tokens: 58736640 | elapsed time per iteration (ms): 110849.9 | learning rate: 2.195E-05 | global batch size: 2048 | lm loss: 5.956155E+00 | loss scale: 4096.0 | grad norm: 7416.948 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 403/ 292968 | consumed samples: 825344 | consumed tokens: 58900480 | elapsed time per iteration (ms): 111409.2 | learning rate: 2.201E-05 | global batch size: 2048 | lm loss: 5.939478E+00 | loss scale: 4096.0 | grad norm: 10877.953 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 404/ 292968 | consumed samples: 827392 | consumed tokens: 59064320 | elapsed time per iteration (ms): 110710.9 | learning rate: 2.206E-05 | global batch size: 2048 | lm loss: 5.979051E+00 | loss scale: 4096.0 | grad norm: 8341.098 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 405/ 292968 | consumed samples: 829440 | consumed tokens: 59228160 | elapsed time per iteration (ms): 111054.7 | learning rate: 2.212E-05 | global batch size: 2048 | lm loss: 5.940287E+00 | loss scale: 4096.0 | grad norm: 8307.108 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 406/ 292968 | consumed samples: 831488 | consumed tokens: 59392000 | elapsed time per iteration (ms): 110861.1 | learning rate: 2.217E-05 | global batch size: 2048 | lm loss: 5.927485E+00 | loss scale: 4096.0 | grad norm: 6381.366 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 407/ 292968 | consumed samples: 833536 | consumed tokens: 59555840 | elapsed time per iteration (ms): 111740.3 | learning rate: 2.223E-05 | global batch size: 2048 | lm loss: 5.919347E+00 | loss scale: 4096.0 | grad norm: 7869.393 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 408/ 292968 | consumed samples: 835584 | consumed tokens: 59719680 | elapsed time per iteration (ms): 111428.8 | learning rate: 2.228E-05 | global batch size: 2048 | lm loss: 5.930279E+00 | loss scale: 4096.0 | grad norm: 7236.359 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 409/ 292968 | consumed samples: 837632 | consumed tokens: 59883520 | elapsed time per iteration (ms): 111998.1 | learning rate: 2.234E-05 | global batch size: 2048 | lm loss: 5.933620E+00 | loss scale: 4096.0 | grad norm: 11345.888 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 410/ 292968 | consumed samples: 839680 | consumed tokens: 60047360 | elapsed time per iteration (ms): 110509.9 | learning rate: 2.239E-05 | global batch size: 2048 | lm loss: 5.911540E+00 | loss scale: 4096.0 | grad norm: 6714.554 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 411/ 292968 | consumed samples: 841728 | consumed tokens: 60211200 | elapsed time per iteration (ms): 111105.7 | learning rate: 2.245E-05 | global batch size: 2048 | lm loss: 5.929781E+00 | loss scale: 4096.0 | grad norm: 8914.103 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 412/ 292968 | consumed samples: 843776 | consumed tokens: 60375040 | elapsed time per iteration (ms): 112271.4 | learning rate: 2.250E-05 | global batch size: 2048 | lm loss: 5.920529E+00 | loss scale: 4096.0 | grad norm: 6486.793 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 413/ 292968 | consumed samples: 845824 | consumed tokens: 60538880 | elapsed time per iteration (ms): 109017.1 | learning rate: 2.256E-05 | global batch size: 2048 | lm loss: 5.887307E+00 | loss scale: 4096.0 | grad norm: 10389.662 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 414/ 292968 | consumed samples: 847872 | consumed tokens: 60702720 | elapsed time per iteration (ms): 111271.4 | learning rate: 2.261E-05 | global batch size: 2048 | lm loss: 5.913101E+00 | loss scale: 4096.0 | grad norm: 6550.185 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 415/ 292968 | consumed samples: 849920 | consumed tokens: 60866560 | elapsed time per iteration (ms): 112293.4 | learning rate: 2.266E-05 | global batch size: 2048 | lm loss: 5.934922E+00 | loss scale: 4096.0 | grad norm: 7186.484 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 416/ 292968 | consumed samples: 851968 | consumed tokens: 61030400 | elapsed time per iteration (ms): 111016.4 | learning rate: 2.272E-05 | global batch size: 2048 | lm loss: 5.934074E+00 | loss scale: 4096.0 | grad norm: 8400.177 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 417/ 292968 | consumed samples: 854016 | consumed tokens: 61194240 | elapsed time per iteration (ms): 110903.7 | learning rate: 2.277E-05 | global batch size: 2048 | lm loss: 5.908431E+00 | loss scale: 4096.0 | grad norm: 8875.847 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 418/ 292968 | consumed samples: 856064 | consumed tokens: 61358080 | elapsed time per iteration (ms): 111282.2 | learning rate: 2.283E-05 | global batch size: 2048 | lm loss: 5.905128E+00 | loss scale: 4096.0 | grad norm: 8686.415 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 419/ 292968 | consumed samples: 858112 | consumed tokens: 61521920 | elapsed time per iteration (ms): 110587.3 | learning rate: 2.288E-05 | global batch size: 2048 | lm loss: 5.893132E+00 | loss scale: 4096.0 | grad norm: 6899.675 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 420/ 292968 | consumed samples: 860160 | consumed tokens: 61685760 | elapsed time per iteration (ms): 111547.4 | learning rate: 2.294E-05 | global batch size: 2048 | lm loss: 5.879992E+00 | loss scale: 4096.0 | grad norm: 9016.726 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 421/ 292968 | consumed samples: 862208 | consumed tokens: 61849600 | elapsed time per iteration (ms): 110682.4 | learning rate: 2.299E-05 | global batch size: 2048 | lm loss: 5.891510E+00 | loss scale: 4096.0 | grad norm: 6700.583 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 422/ 292968 | consumed samples: 864256 | consumed tokens: 62013440 | elapsed time per iteration (ms): 111400.6 | learning rate: 2.305E-05 | global batch size: 2048 | lm loss: 5.858379E+00 | loss scale: 4096.0 | grad norm: 9252.917 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 423/ 292968 | consumed samples: 866304 | consumed tokens: 62177280 | elapsed time per iteration (ms): 110318.0 | learning rate: 2.310E-05 | global batch size: 2048 | lm loss: 5.902158E+00 | loss scale: 4096.0 | grad norm: 7999.601 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 424/ 292968 | consumed samples: 868352 | consumed tokens: 62341120 | elapsed time per iteration (ms): 111791.5 | learning rate: 2.316E-05 | global batch size: 2048 | lm loss: 5.875941E+00 | loss scale: 4096.0 | grad norm: 7035.342 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 425/ 292968 | consumed samples: 870400 | consumed tokens: 62504960 | elapsed time per iteration (ms): 111774.3 | learning rate: 2.321E-05 | global batch size: 2048 | lm loss: 5.882755E+00 | loss scale: 4096.0 | grad norm: 7678.213 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 426/ 292968 | consumed samples: 872448 | consumed tokens: 62668800 | elapsed time per iteration (ms): 110192.2 | learning rate: 2.327E-05 | global batch size: 2048 | lm loss: 5.864240E+00 | loss scale: 4096.0 | grad norm: 6738.731 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 427/ 292968 | consumed samples: 874496 | consumed tokens: 62832640 | elapsed time per iteration (ms): 111908.1 | learning rate: 2.332E-05 | global batch size: 2048 | lm loss: 5.889598E+00 | loss scale: 4096.0 | grad norm: 8525.413 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 428/ 292968 | consumed samples: 876544 | consumed tokens: 62996480 | elapsed time per iteration (ms): 110394.5 | learning rate: 2.337E-05 | global batch size: 2048 | lm loss: 5.862865E+00 | loss scale: 4096.0 | grad norm: 7663.949 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 429/ 292968 | consumed samples: 878592 | consumed tokens: 63160320 | elapsed time per iteration (ms): 110551.1 | learning rate: 2.343E-05 | global batch size: 2048 | lm loss: 5.849281E+00 | loss scale: 4096.0 | grad norm: 8562.605 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 430/ 292968 | consumed samples: 880640 | consumed tokens: 63324160 | elapsed time per iteration (ms): 113248.9 | learning rate: 2.348E-05 | global batch size: 2048 | lm loss: 5.853822E+00 | loss scale: 4096.0 | grad norm: 5995.232 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 431/ 292968 | consumed samples: 882688 | consumed tokens: 63488000 | elapsed time per iteration (ms): 109731.3 | learning rate: 2.354E-05 | global batch size: 2048 | lm loss: 5.841829E+00 | loss scale: 4096.0 | grad norm: 7528.770 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 432/ 292968 | consumed samples: 884736 | consumed tokens: 63651840 | elapsed time per iteration (ms): 110793.7 | learning rate: 2.359E-05 | global batch size: 2048 | lm loss: 5.848011E+00 | loss scale: 4096.0 | grad norm: 7341.500 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 433/ 292968 | consumed samples: 886784 | consumed tokens: 63815680 | elapsed time per iteration (ms): 111233.5 | learning rate: 2.365E-05 | global batch size: 2048 | lm loss: 5.847678E+00 | loss scale: 4096.0 | grad norm: 6375.711 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 434/ 292968 | consumed samples: 888832 | consumed tokens: 63979520 | elapsed time per iteration (ms): 110543.2 | learning rate: 2.370E-05 | global batch size: 2048 | lm loss: 5.864359E+00 | loss scale: 4096.0 | grad norm: 7702.682 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 435/ 292968 | consumed samples: 890880 | consumed tokens: 64143360 | elapsed time per iteration (ms): 111519.5 | learning rate: 2.376E-05 | global batch size: 2048 | lm loss: 5.824051E+00 | loss scale: 4096.0 | grad norm: 8466.591 | num zeros: 0.0 | curriculum seqlen: 80 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 436/ 292968 | consumed samples: 892928 | consumed tokens: 64323584 | elapsed time per iteration (ms): 111920.3 | learning rate: 2.381E-05 | global batch size: 2048 | lm loss: 5.875383E+00 | loss scale: 4096.0 | grad norm: 8202.423 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 437/ 292968 | consumed samples: 894976 | consumed tokens: 64503808 | elapsed time per iteration (ms): 111460.1 | learning rate: 2.387E-05 | global batch size: 2048 | lm loss: 5.860913E+00 | loss scale: 4096.0 | grad norm: 7979.796 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 438/ 292968 | consumed samples: 897024 | consumed tokens: 64684032 | elapsed time per iteration (ms): 111081.1 | learning rate: 2.392E-05 | global batch size: 2048 | lm loss: 5.884607E+00 | loss scale: 4096.0 | grad norm: 8414.805 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 439/ 292968 | consumed samples: 899072 | consumed tokens: 64864256 | elapsed time per iteration (ms): 112145.0 | learning rate: 2.398E-05 | global batch size: 2048 | lm loss: 5.869011E+00 | loss scale: 4096.0 | grad norm: 8449.181 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 440/ 292968 | consumed samples: 901120 | consumed tokens: 65044480 | elapsed time per iteration (ms): 110244.3 | learning rate: 2.403E-05 | global batch size: 2048 | lm loss: 5.863712E+00 | loss scale: 4096.0 | grad norm: 7647.721 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 441/ 292968 | consumed samples: 903168 | consumed tokens: 65224704 | elapsed time per iteration (ms): 113414.1 | learning rate: 2.408E-05 | global batch size: 2048 | lm loss: 5.844038E+00 | loss scale: 4096.0 | grad norm: 7491.108 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 442/ 292968 | consumed samples: 905216 | consumed tokens: 65404928 | elapsed time per iteration (ms): 112488.2 | learning rate: 2.414E-05 | global batch size: 2048 | lm loss: 5.835303E+00 | loss scale: 4096.0 | grad norm: 7486.098 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 443/ 292968 | consumed samples: 907264 | consumed tokens: 65585152 | elapsed time per iteration (ms): 112575.9 | learning rate: 2.419E-05 | global batch size: 2048 | lm loss: 5.811860E+00 | loss scale: 4096.0 | grad norm: 6142.082 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 444/ 292968 | consumed samples: 909312 | consumed tokens: 65765376 | elapsed time per iteration (ms): 111519.6 | learning rate: 2.425E-05 | global batch size: 2048 | lm loss: 5.835458E+00 | loss scale: 4096.0 | grad norm: 9518.485 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 445/ 292968 | consumed samples: 911360 | consumed tokens: 65945600 | elapsed time per iteration (ms): 110315.3 | learning rate: 2.430E-05 | global batch size: 2048 | lm loss: 5.848936E+00 | loss scale: 4096.0 | grad norm: 7112.498 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 446/ 292968 | consumed samples: 913408 | consumed tokens: 66125824 | elapsed time per iteration (ms): 112267.6 | learning rate: 2.436E-05 | global batch size: 2048 | lm loss: 5.829205E+00 | loss scale: 4096.0 | grad norm: 8426.563 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 447/ 292968 | consumed samples: 915456 | consumed tokens: 66306048 | elapsed time per iteration (ms): 113057.7 | learning rate: 2.441E-05 | global batch size: 2048 | lm loss: 5.799595E+00 | loss scale: 4096.0 | grad norm: 6302.678 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 448/ 292968 | consumed samples: 917504 | consumed tokens: 66486272 | elapsed time per iteration (ms): 111313.9 | learning rate: 2.447E-05 | global batch size: 2048 | lm loss: 5.803114E+00 | loss scale: 4096.0 | grad norm: 8046.539 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 449/ 292968 | consumed samples: 919552 | consumed tokens: 66666496 | elapsed time per iteration (ms): 111056.3 | learning rate: 2.452E-05 | global batch size: 2048 | lm loss: 5.841568E+00 | loss scale: 4096.0 | grad norm: 8335.389 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 450/ 292968 | consumed samples: 921600 | consumed tokens: 66846720 | elapsed time per iteration (ms): 111837.1 | learning rate: 2.458E-05 | global batch size: 2048 | lm loss: 5.795787E+00 | loss scale: 4096.0 | grad norm: 7230.197 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
------------------------------------------------------------------------------------------------
- validation loss at iteration 450 | lm loss value: 5.797653E+00 | lm loss PPL: 3.295252E+02 |
------------------------------------------------------------------------------------------------
- iteration 451/ 292968 | consumed samples: 923648 | consumed tokens: 67026944 | elapsed time per iteration (ms): 306337.1 | learning rate: 2.463E-05 | global batch size: 2048 | lm loss: 5.839751E+00 | loss scale: 4096.0 | grad norm: 7056.284 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 452/ 292968 | consumed samples: 925696 | consumed tokens: 67207168 | elapsed time per iteration (ms): 110715.8 | learning rate: 2.469E-05 | global batch size: 2048 | lm loss: 5.841124E+00 | loss scale: 4096.0 | grad norm: 8359.813 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 453/ 292968 | consumed samples: 927744 | consumed tokens: 67387392 | elapsed time per iteration (ms): 112341.8 | learning rate: 2.474E-05 | global batch size: 2048 | lm loss: 5.786106E+00 | loss scale: 4096.0 | grad norm: 5697.561 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 454/ 292968 | consumed samples: 929792 | consumed tokens: 67567616 | elapsed time per iteration (ms): 110986.5 | learning rate: 2.479E-05 | global batch size: 2048 | lm loss: 5.813969E+00 | loss scale: 4096.0 | grad norm: 6702.502 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 455/ 292968 | consumed samples: 931840 | consumed tokens: 67747840 | elapsed time per iteration (ms): 112633.8 | learning rate: 2.485E-05 | global batch size: 2048 | lm loss: 5.833419E+00 | loss scale: 4096.0 | grad norm: 8186.556 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 456/ 292968 | consumed samples: 933888 | consumed tokens: 67928064 | elapsed time per iteration (ms): 110044.5 | learning rate: 2.490E-05 | global batch size: 2048 | lm loss: 5.788242E+00 | loss scale: 4096.0 | grad norm: 6566.319 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 457/ 292968 | consumed samples: 935936 | consumed tokens: 68108288 | elapsed time per iteration (ms): 109075.1 | learning rate: 2.496E-05 | global batch size: 2048 | lm loss: 5.787961E+00 | loss scale: 4096.0 | grad norm: 9340.291 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 458/ 292968 | consumed samples: 937984 | consumed tokens: 68288512 | elapsed time per iteration (ms): 112335.8 | learning rate: 2.501E-05 | global batch size: 2048 | lm loss: 5.796710E+00 | loss scale: 4096.0 | grad norm: 7851.998 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 459/ 292968 | consumed samples: 940032 | consumed tokens: 68468736 | elapsed time per iteration (ms): 112079.1 | learning rate: 2.507E-05 | global batch size: 2048 | lm loss: 5.768667E+00 | loss scale: 4096.0 | grad norm: 6811.706 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 460/ 292968 | consumed samples: 942080 | consumed tokens: 68648960 | elapsed time per iteration (ms): 109763.8 | learning rate: 2.512E-05 | global batch size: 2048 | lm loss: 5.788221E+00 | loss scale: 4096.0 | grad norm: 6644.422 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 461/ 292968 | consumed samples: 944128 | consumed tokens: 68829184 | elapsed time per iteration (ms): 111643.8 | learning rate: 2.518E-05 | global batch size: 2048 | lm loss: 5.792457E+00 | loss scale: 4096.0 | grad norm: 6457.879 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 462/ 292968 | consumed samples: 946176 | consumed tokens: 69009408 | elapsed time per iteration (ms): 112891.9 | learning rate: 2.523E-05 | global batch size: 2048 | lm loss: 5.787377E+00 | loss scale: 4096.0 | grad norm: 8550.382 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 463/ 292968 | consumed samples: 948224 | consumed tokens: 69189632 | elapsed time per iteration (ms): 110547.9 | learning rate: 2.529E-05 | global batch size: 2048 | lm loss: 5.764702E+00 | loss scale: 4096.0 | grad norm: 7714.266 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 464/ 292968 | consumed samples: 950272 | consumed tokens: 69369856 | elapsed time per iteration (ms): 111703.3 | learning rate: 2.534E-05 | global batch size: 2048 | lm loss: 5.773074E+00 | loss scale: 4096.0 | grad norm: 5797.288 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 465/ 292968 | consumed samples: 952320 | consumed tokens: 69550080 | elapsed time per iteration (ms): 110500.5 | learning rate: 2.540E-05 | global batch size: 2048 | lm loss: 5.771196E+00 | loss scale: 4096.0 | grad norm: 5840.679 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 466/ 292968 | consumed samples: 954368 | consumed tokens: 69730304 | elapsed time per iteration (ms): 111266.6 | learning rate: 2.545E-05 | global batch size: 2048 | lm loss: 5.767781E+00 | loss scale: 4096.0 | grad norm: 11007.546 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 467/ 292968 | consumed samples: 956416 | consumed tokens: 69910528 | elapsed time per iteration (ms): 111803.1 | learning rate: 2.550E-05 | global batch size: 2048 | lm loss: 5.789933E+00 | loss scale: 4096.0 | grad norm: 7147.749 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 468/ 292968 | consumed samples: 958464 | consumed tokens: 70090752 | elapsed time per iteration (ms): 111683.8 | learning rate: 2.556E-05 | global batch size: 2048 | lm loss: 5.777902E+00 | loss scale: 4096.0 | grad norm: 7919.103 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 469/ 292968 | consumed samples: 960512 | consumed tokens: 70270976 | elapsed time per iteration (ms): 112309.7 | learning rate: 2.561E-05 | global batch size: 2048 | lm loss: 5.717086E+00 | loss scale: 4096.0 | grad norm: 5935.173 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 470/ 292968 | consumed samples: 962560 | consumed tokens: 70451200 | elapsed time per iteration (ms): 110157.1 | learning rate: 2.567E-05 | global batch size: 2048 | lm loss: 5.732512E+00 | loss scale: 4096.0 | grad norm: 6728.798 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 471/ 292968 | consumed samples: 964608 | consumed tokens: 70631424 | elapsed time per iteration (ms): 110509.7 | learning rate: 2.572E-05 | global batch size: 2048 | lm loss: 5.737529E+00 | loss scale: 4096.0 | grad norm: 5937.898 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 472/ 292968 | consumed samples: 966656 | consumed tokens: 70811648 | elapsed time per iteration (ms): 110367.2 | learning rate: 2.578E-05 | global batch size: 2048 | lm loss: 5.735291E+00 | loss scale: 4096.0 | grad norm: 7384.296 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 473/ 292968 | consumed samples: 968704 | consumed tokens: 70991872 | elapsed time per iteration (ms): 110163.7 | learning rate: 2.583E-05 | global batch size: 2048 | lm loss: 5.755392E+00 | loss scale: 4096.0 | grad norm: 6461.042 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 474/ 292968 | consumed samples: 970752 | consumed tokens: 71172096 | elapsed time per iteration (ms): 110298.3 | learning rate: 2.589E-05 | global batch size: 2048 | lm loss: 5.782510E+00 | loss scale: 4096.0 | grad norm: 7188.657 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 475/ 292968 | consumed samples: 972800 | consumed tokens: 71352320 | elapsed time per iteration (ms): 110376.1 | learning rate: 2.594E-05 | global batch size: 2048 | lm loss: 5.740060E+00 | loss scale: 4096.0 | grad norm: 8687.536 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 476/ 292968 | consumed samples: 974848 | consumed tokens: 71532544 | elapsed time per iteration (ms): 111196.0 | learning rate: 2.600E-05 | global batch size: 2048 | lm loss: 5.759412E+00 | loss scale: 4096.0 | grad norm: 6746.615 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 477/ 292968 | consumed samples: 976896 | consumed tokens: 71712768 | elapsed time per iteration (ms): 110984.8 | learning rate: 2.605E-05 | global batch size: 2048 | lm loss: 5.743295E+00 | loss scale: 4096.0 | grad norm: 6837.263 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 478/ 292968 | consumed samples: 978944 | consumed tokens: 71892992 | elapsed time per iteration (ms): 109018.1 | learning rate: 2.611E-05 | global batch size: 2048 | lm loss: 5.736754E+00 | loss scale: 4096.0 | grad norm: 6487.576 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 479/ 292968 | consumed samples: 980992 | consumed tokens: 72073216 | elapsed time per iteration (ms): 110343.4 | learning rate: 2.616E-05 | global batch size: 2048 | lm loss: 5.747668E+00 | loss scale: 4096.0 | grad norm: 8492.173 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 480/ 292968 | consumed samples: 983040 | consumed tokens: 72253440 | elapsed time per iteration (ms): 112724.2 | learning rate: 2.621E-05 | global batch size: 2048 | lm loss: 5.731270E+00 | loss scale: 4096.0 | grad norm: 6825.831 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 481/ 292968 | consumed samples: 985088 | consumed tokens: 72433664 | elapsed time per iteration (ms): 111444.0 | learning rate: 2.627E-05 | global batch size: 2048 | lm loss: 5.745525E+00 | loss scale: 4096.0 | grad norm: 5987.143 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 482/ 292968 | consumed samples: 987136 | consumed tokens: 72613888 | elapsed time per iteration (ms): 111732.8 | learning rate: 2.632E-05 | global batch size: 2048 | lm loss: 5.711495E+00 | loss scale: 4096.0 | grad norm: 6874.974 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 483/ 292968 | consumed samples: 989184 | consumed tokens: 72794112 | elapsed time per iteration (ms): 112960.6 | learning rate: 2.638E-05 | global batch size: 2048 | lm loss: 5.745270E+00 | loss scale: 4096.0 | grad norm: 6884.282 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 484/ 292968 | consumed samples: 991232 | consumed tokens: 72974336 | elapsed time per iteration (ms): 109723.3 | learning rate: 2.643E-05 | global batch size: 2048 | lm loss: 5.718277E+00 | loss scale: 4096.0 | grad norm: 6478.191 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 485/ 292968 | consumed samples: 993280 | consumed tokens: 73154560 | elapsed time per iteration (ms): 113336.5 | learning rate: 2.649E-05 | global batch size: 2048 | lm loss: 5.686126E+00 | loss scale: 4096.0 | grad norm: 5766.168 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 486/ 292968 | consumed samples: 995328 | consumed tokens: 73334784 | elapsed time per iteration (ms): 112106.0 | learning rate: 2.654E-05 | global batch size: 2048 | lm loss: 5.711407E+00 | loss scale: 4096.0 | grad norm: 6108.886 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 487/ 292968 | consumed samples: 997376 | consumed tokens: 73515008 | elapsed time per iteration (ms): 111475.1 | learning rate: 2.660E-05 | global batch size: 2048 | lm loss: 5.688071E+00 | loss scale: 4096.0 | grad norm: 6007.896 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 488/ 292968 | consumed samples: 999424 | consumed tokens: 73695232 | elapsed time per iteration (ms): 111752.6 | learning rate: 2.665E-05 | global batch size: 2048 | lm loss: 5.690403E+00 | loss scale: 4096.0 | grad norm: 7149.547 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 489/ 292968 | consumed samples: 1001472 | consumed tokens: 73875456 | elapsed time per iteration (ms): 111322.4 | learning rate: 2.671E-05 | global batch size: 2048 | lm loss: 5.661258E+00 | loss scale: 4096.0 | grad norm: 6795.608 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 490/ 292968 | consumed samples: 1003520 | consumed tokens: 74055680 | elapsed time per iteration (ms): 110976.8 | learning rate: 2.676E-05 | global batch size: 2048 | lm loss: 5.681107E+00 | loss scale: 4096.0 | grad norm: 8144.001 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 491/ 292968 | consumed samples: 1005568 | consumed tokens: 74235904 | elapsed time per iteration (ms): 112743.6 | learning rate: 2.682E-05 | global batch size: 2048 | lm loss: 5.714880E+00 | loss scale: 4096.0 | grad norm: 5797.093 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 492/ 292968 | consumed samples: 1007616 | consumed tokens: 74416128 | elapsed time per iteration (ms): 113321.0 | learning rate: 2.687E-05 | global batch size: 2048 | lm loss: 5.666462E+00 | loss scale: 4096.0 | grad norm: 9436.325 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 493/ 292968 | consumed samples: 1009664 | consumed tokens: 74596352 | elapsed time per iteration (ms): 111039.2 | learning rate: 2.692E-05 | global batch size: 2048 | lm loss: 5.684762E+00 | loss scale: 4096.0 | grad norm: 6744.780 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 494/ 292968 | consumed samples: 1011712 | consumed tokens: 74776576 | elapsed time per iteration (ms): 113015.0 | learning rate: 2.698E-05 | global batch size: 2048 | lm loss: 5.680274E+00 | loss scale: 4096.0 | grad norm: 7683.869 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 495/ 292968 | consumed samples: 1013760 | consumed tokens: 74956800 | elapsed time per iteration (ms): 111558.7 | learning rate: 2.703E-05 | global batch size: 2048 | lm loss: 5.659842E+00 | loss scale: 4096.0 | grad norm: 5214.174 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 496/ 292968 | consumed samples: 1015808 | consumed tokens: 75137024 | elapsed time per iteration (ms): 112430.7 | learning rate: 2.709E-05 | global batch size: 2048 | lm loss: 5.694101E+00 | loss scale: 4096.0 | grad norm: 8412.757 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 497/ 292968 | consumed samples: 1017856 | consumed tokens: 75317248 | elapsed time per iteration (ms): 109975.2 | learning rate: 2.714E-05 | global batch size: 2048 | lm loss: 5.656071E+00 | loss scale: 4096.0 | grad norm: 5692.706 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 498/ 292968 | consumed samples: 1019904 | consumed tokens: 75497472 | elapsed time per iteration (ms): 112798.1 | learning rate: 2.720E-05 | global batch size: 2048 | lm loss: 5.697061E+00 | loss scale: 4096.0 | grad norm: 7366.636 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 499/ 292968 | consumed samples: 1021952 | consumed tokens: 75677696 | elapsed time per iteration (ms): 110248.7 | learning rate: 2.725E-05 | global batch size: 2048 | lm loss: 5.679244E+00 | loss scale: 4096.0 | grad norm: 7223.102 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 500/ 292968 | consumed samples: 1024000 | consumed tokens: 75857920 | elapsed time per iteration (ms): 112437.7 | learning rate: 2.731E-05 | global batch size: 2048 | lm loss: 5.666466E+00 | loss scale: 8192.0 | grad norm: 6671.781 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 501/ 292968 | consumed samples: 1026048 | consumed tokens: 76038144 | elapsed time per iteration (ms): 112664.7 | learning rate: 2.736E-05 | global batch size: 2048 | lm loss: 5.669477E+00 | loss scale: 8192.0 | grad norm: 14864.149 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 502/ 292968 | consumed samples: 1028096 | consumed tokens: 76218368 | elapsed time per iteration (ms): 113293.4 | learning rate: 2.742E-05 | global batch size: 2048 | lm loss: 5.667769E+00 | loss scale: 8192.0 | grad norm: 15154.050 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 503/ 292968 | consumed samples: 1030144 | consumed tokens: 76398592 | elapsed time per iteration (ms): 112921.0 | learning rate: 2.747E-05 | global batch size: 2048 | lm loss: 5.627787E+00 | loss scale: 8192.0 | grad norm: 10410.178 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 504/ 292968 | consumed samples: 1032192 | consumed tokens: 76578816 | elapsed time per iteration (ms): 113753.7 | learning rate: 2.753E-05 | global batch size: 2048 | lm loss: 5.639052E+00 | loss scale: 8192.0 | grad norm: 14485.500 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 505/ 292968 | consumed samples: 1034240 | consumed tokens: 76759040 | elapsed time per iteration (ms): 110326.5 | learning rate: 2.758E-05 | global batch size: 2048 | lm loss: 5.631787E+00 | loss scale: 8192.0 | grad norm: 10104.848 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 506/ 292968 | consumed samples: 1036288 | consumed tokens: 76939264 | elapsed time per iteration (ms): 112918.3 | learning rate: 2.763E-05 | global batch size: 2048 | lm loss: 5.668808E+00 | loss scale: 8192.0 | grad norm: 16685.030 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 507/ 292968 | consumed samples: 1038336 | consumed tokens: 77119488 | elapsed time per iteration (ms): 114531.8 | learning rate: 2.769E-05 | global batch size: 2048 | lm loss: 5.653332E+00 | loss scale: 8192.0 | grad norm: 13641.884 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 508/ 292968 | consumed samples: 1040384 | consumed tokens: 77299712 | elapsed time per iteration (ms): 112263.9 | learning rate: 2.774E-05 | global batch size: 2048 | lm loss: 5.617197E+00 | loss scale: 8192.0 | grad norm: 16726.282 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 509/ 292968 | consumed samples: 1042432 | consumed tokens: 77479936 | elapsed time per iteration (ms): 113825.5 | learning rate: 2.780E-05 | global batch size: 2048 | lm loss: 5.639387E+00 | loss scale: 8192.0 | grad norm: 13516.668 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 510/ 292968 | consumed samples: 1044480 | consumed tokens: 77660160 | elapsed time per iteration (ms): 111907.9 | learning rate: 2.785E-05 | global batch size: 2048 | lm loss: 5.616351E+00 | loss scale: 8192.0 | grad norm: 14983.754 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 511/ 292968 | consumed samples: 1046528 | consumed tokens: 77840384 | elapsed time per iteration (ms): 112638.4 | learning rate: 2.791E-05 | global batch size: 2048 | lm loss: 5.645296E+00 | loss scale: 8192.0 | grad norm: 11822.125 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 512/ 292968 | consumed samples: 1048576 | consumed tokens: 78020608 | elapsed time per iteration (ms): 113140.5 | learning rate: 2.796E-05 | global batch size: 2048 | lm loss: 5.661061E+00 | loss scale: 8192.0 | grad norm: 13954.044 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 513/ 292968 | consumed samples: 1050624 | consumed tokens: 78200832 | elapsed time per iteration (ms): 111429.9 | learning rate: 2.802E-05 | global batch size: 2048 | lm loss: 5.638104E+00 | loss scale: 8192.0 | grad norm: 13936.979 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 514/ 292968 | consumed samples: 1052672 | consumed tokens: 78381056 | elapsed time per iteration (ms): 112759.6 | learning rate: 2.807E-05 | global batch size: 2048 | lm loss: 5.619231E+00 | loss scale: 8192.0 | grad norm: 11471.279 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 515/ 292968 | consumed samples: 1054720 | consumed tokens: 78561280 | elapsed time per iteration (ms): 111594.0 | learning rate: 2.813E-05 | global batch size: 2048 | lm loss: 5.633333E+00 | loss scale: 8192.0 | grad norm: 17535.082 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 516/ 292968 | consumed samples: 1056768 | consumed tokens: 78741504 | elapsed time per iteration (ms): 112098.7 | learning rate: 2.818E-05 | global batch size: 2048 | lm loss: 5.663225E+00 | loss scale: 8192.0 | grad norm: 13864.695 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 517/ 292968 | consumed samples: 1058816 | consumed tokens: 78921728 | elapsed time per iteration (ms): 110623.1 | learning rate: 2.824E-05 | global batch size: 2048 | lm loss: 5.618065E+00 | loss scale: 8192.0 | grad norm: 11069.824 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 518/ 292968 | consumed samples: 1060864 | consumed tokens: 79101952 | elapsed time per iteration (ms): 111964.5 | learning rate: 2.829E-05 | global batch size: 2048 | lm loss: 5.635888E+00 | loss scale: 8192.0 | grad norm: 12731.825 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 519/ 292968 | consumed samples: 1062912 | consumed tokens: 79282176 | elapsed time per iteration (ms): 112055.2 | learning rate: 2.834E-05 | global batch size: 2048 | lm loss: 5.601050E+00 | loss scale: 8192.0 | grad norm: 11635.834 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 520/ 292968 | consumed samples: 1064960 | consumed tokens: 79462400 | elapsed time per iteration (ms): 112418.5 | learning rate: 2.840E-05 | global batch size: 2048 | lm loss: 5.645939E+00 | loss scale: 8192.0 | grad norm: 17715.201 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 521/ 292968 | consumed samples: 1067008 | consumed tokens: 79642624 | elapsed time per iteration (ms): 111558.5 | learning rate: 2.845E-05 | global batch size: 2048 | lm loss: 5.586247E+00 | loss scale: 8192.0 | grad norm: 9433.316 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 522/ 292968 | consumed samples: 1069056 | consumed tokens: 79822848 | elapsed time per iteration (ms): 113098.8 | learning rate: 2.851E-05 | global batch size: 2048 | lm loss: 5.607241E+00 | loss scale: 8192.0 | grad norm: 11954.691 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 523/ 292968 | consumed samples: 1071104
| consumed tokens: 80003072 | elapsed time per iteration (ms): 112106.8 | learning rate: 2.856E-05 | global batch size: 2048 | lm loss: 5.652853E+00 | loss scale: 8192.0 | grad norm: 16648.802 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 524/ 292968 | consumed samples: 1073152 | consumed tokens: 80183296 | elapsed time per iteration (ms): 112809.7 | learning rate: 2.862E-05 | global batch size: 2048 | lm loss: 5.599886E+00 | loss scale: 8192.0 | grad norm: 9193.022 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 525/ 292968 | consumed samples: 1075200 | consumed tokens: 80363520 | elapsed time per iteration (ms): 114026.4 | learning rate: 2.867E-05 | global batch size: 2048 | lm loss: 5.635831E+00 | loss scale: 8192.0 | grad norm: 22370.033 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 526/ 292968 | consumed samples: 1077248 | consumed tokens: 80543744 | elapsed time per iteration (ms): 112873.4 | learning rate: 2.873E-05 | global batch size: 2048 | lm loss: 5.630721E+00 | loss scale: 8192.0 | grad norm: 11212.895 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 527/ 292968 | consumed samples: 1079296 | consumed tokens: 80723968 | elapsed time per iteration (ms): 112562.2 | learning rate: 2.878E-05 | global batch size: 2048 | lm loss: 5.617833E+00 | loss scale: 8192.0 | grad norm: 16194.164 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 528/ 292968 | consumed samples: 1081344 | consumed tokens: 80904192 | elapsed time per iteration (ms): 112871.5 | learning rate: 2.884E-05 | global batch size: 2048 | lm loss: 5.614437E+00 | loss scale: 8192.0 | grad norm: 13321.010 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 529/ 292968 | consumed samples: 1083392 | consumed tokens: 81084416 | elapsed time per iteration (ms): 112230.3 | learning rate: 2.889E-05 | global batch size: 2048 | lm loss: 5.596371E+00 | loss scale: 8192.0 | grad norm: 9818.933 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 530/ 292968 | consumed samples: 1085440 | consumed tokens: 81264640 | elapsed time per iteration (ms): 111781.8 | learning rate: 2.895E-05 | global batch size: 2048 | lm loss: 5.628756E+00 | loss scale: 8192.0 | grad norm: 15970.761 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 531/ 292968 | consumed samples: 1087488 | consumed tokens: 81444864 | elapsed time per iteration (ms): 112070.8 | learning rate: 2.900E-05 | global batch size: 2048 | lm loss: 5.574606E+00 | loss scale: 8192.0 | grad norm: 12453.852 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 532/ 292968 | consumed samples: 1089536 | consumed tokens: 81625088 | elapsed time per iteration (ms): 111479.9 | learning rate: 2.905E-05 | global batch size: 2048 | lm loss: 5.553162E+00 | loss scale: 8192.0 | grad norm: 12601.321 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 533/ 292968 | consumed samples: 1091584 | consumed tokens: 81805312 | elapsed time per iteration (ms): 111390.0 | learning rate: 2.911E-05 | global batch size: 2048 | lm loss: 5.609733E+00 | loss scale: 8192.0 | grad norm: 13511.849 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 534/ 292968 | consumed samples: 1093632 | consumed tokens: 81985536 | elapsed time per iteration (ms): 112213.7 | learning rate: 2.916E-05 | global batch size: 2048 | lm loss: 5.583689E+00 | loss scale: 8192.0 | grad norm: 11190.455 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 535/ 292968 | consumed samples: 1095680 | consumed tokens: 82165760 | elapsed time per iteration (ms): 112993.1 | learning rate: 2.922E-05 | global batch size: 2048 | lm loss: 5.653582E+00 | loss scale: 8192.0 | grad norm: 20818.658 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 536/ 292968 | consumed samples: 1097728 | consumed tokens: 82345984 | elapsed time per iteration (ms): 112307.7 | learning rate: 2.927E-05 | global batch size: 2048 | lm loss: 5.611212E+00 | loss scale: 8192.0 | grad norm: 10362.696 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 537/ 292968 | consumed samples: 1099776 | consumed tokens: 82526208 | elapsed time per iteration (ms): 112970.5 | learning rate: 2.933E-05 | global batch size: 2048 | lm loss: 5.618240E+00 | loss scale: 8192.0 | grad norm: 14839.821 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 538/ 292968 | consumed samples: 1101824 | consumed tokens: 82706432 | elapsed time per iteration (ms): 113120.5 | learning rate: 2.938E-05 | global batch size: 2048 | lm loss: 5.594517E+00 | loss scale: 8192.0 | grad norm: 13605.480 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 539/ 292968 | consumed samples: 1103872 | consumed tokens: 82886656 | elapsed time per iteration (ms): 112476.6 | learning rate: 2.944E-05 | global batch size: 2048 | lm loss: 5.556248E+00 | loss scale: 8192.0 | grad norm: 13800.093 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 540/ 292968 | consumed samples: 1105920 | consumed tokens: 83066880 | elapsed time per iteration (ms): 114182.8 | learning rate: 2.949E-05 | global batch size: 2048 | lm loss: 5.591393E+00 | loss scale: 8192.0 | grad norm: 10588.037 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 541/ 292968 | consumed samples: 1107968 | consumed tokens: 83247104 | elapsed time per iteration (ms): 110876.2 | learning rate: 2.955E-05 | global batch size: 2048 | lm loss: 5.556509E+00 | loss scale: 8192.0 | grad norm: 13801.950 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 542/ 292968 | consumed samples: 1110016 | consumed tokens: 83427328 | elapsed time per iteration (ms): 111658.7 | learning rate: 2.960E-05 | global batch size: 2048 | lm loss: 5.569237E+00 | loss scale: 8192.0 | grad norm: 14005.832 | num zeros: 
0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 543/ 292968 | consumed samples: 1112064 | consumed tokens: 83607552 | elapsed time per iteration (ms): 112214.1 | learning rate: 2.966E-05 | global batch size: 2048 | lm loss: 5.546272E+00 | loss scale: 8192.0 | grad norm: 11650.584 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 544/ 292968 | consumed samples: 1114112 | consumed tokens: 83787776 | elapsed time per iteration (ms): 113179.9 | learning rate: 2.971E-05 | global batch size: 2048 | lm loss: 5.549253E+00 | loss scale: 8192.0 | grad norm: 13630.378 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 545/ 292968 | consumed samples: 1116160 | consumed tokens: 83968000 | elapsed time per iteration (ms): 112602.2 | learning rate: 2.976E-05 | global batch size: 2048 | lm loss: 5.533734E+00 | loss scale: 8192.0 | grad norm: 10491.189 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 546/ 292968 | consumed samples: 1118208 | consumed tokens: 84148224 | elapsed time per iteration (ms): 112024.9 | learning rate: 2.982E-05 | global batch size: 2048 | lm loss: 5.555665E+00 | loss scale: 8192.0 | grad norm: 14130.965 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 547/ 292968 | consumed samples: 1120256 | consumed tokens: 84328448 | elapsed time per iteration (ms): 112655.1 | learning rate: 2.987E-05 | global batch size: 2048 | lm loss: 5.551611E+00 | loss scale: 8192.0 | grad norm: 12855.412 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 548/ 292968 | consumed samples: 1122304 | consumed tokens: 84508672 | elapsed time per iteration (ms): 111576.2 | learning rate: 2.993E-05 | global batch size: 2048 | lm loss: 5.609882E+00 | loss scale: 8192.0 | grad norm: 15275.244 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 549/ 292968 | consumed samples: 1124352 | consumed tokens: 84688896 | elapsed time per iteration (ms): 112577.9 | learning rate: 2.998E-05 | global batch size: 2048 | lm loss: 5.596916E+00 | loss scale: 8192.0 | grad norm: 13652.963 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 550/ 292968 | consumed samples: 1126400 | consumed tokens: 84869120 | elapsed time per iteration (ms): 112519.9 | learning rate: 3.004E-05 | global batch size: 2048 | lm loss: 5.550436E+00 | loss scale: 8192.0 | grad norm: 10479.605 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 551/ 292968 | consumed samples: 1128448 | consumed tokens: 85049344 | elapsed time per iteration (ms): 111692.6 | learning rate: 3.009E-05 | global batch size: 2048 | lm loss: 5.542852E+00 | loss scale: 8192.0 | grad norm: 18511.064 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 552/ 292968 | consumed samples: 1130496 | consumed tokens: 85229568 | elapsed time per iteration (ms): 111626.8 | learning rate: 3.015E-05 | global batch size: 2048 | lm 
loss: 5.529922E+00 | loss scale: 8192.0 | grad norm: 9669.866 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 553/ 292968 | consumed samples: 1132544 | consumed tokens: 85409792 | elapsed time per iteration (ms): 112073.7 | learning rate: 3.020E-05 | global batch size: 2048 | lm loss: 5.545301E+00 | loss scale: 8192.0 | grad norm: 12652.392 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 554/ 292968 | consumed samples: 1134592 | consumed tokens: 85590016 | elapsed time per iteration (ms): 111561.5 | learning rate: 3.026E-05 | global batch size: 2048 | lm loss: 5.548908E+00 | loss scale: 8192.0 | grad norm: 12234.313 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 555/ 292968 | consumed samples: 1136640 | consumed tokens: 85770240 | elapsed time per iteration (ms): 111056.4 | learning rate: 3.031E-05 | global batch size: 2048 | lm loss: 5.538098E+00 | loss scale: 8192.0 | grad norm: 12248.211 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 556/ 292968 | consumed samples: 1138688 | consumed tokens: 85950464 | elapsed time per iteration (ms): 111595.6 | learning rate: 3.037E-05 | global batch size: 2048 | lm loss: 5.537742E+00 | loss scale: 8192.0 | grad norm: 10560.271 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 557/ 292968 | consumed samples: 1140736 | consumed tokens: 86130688 | elapsed time per iteration (ms): 113191.6 | learning rate: 3.042E-05 | global batch size: 2048 | lm loss: 5.517148E+00 | loss scale: 8192.0 | grad norm: 14233.138 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 558/ 292968 | consumed samples: 1142784 | consumed tokens: 86310912 | elapsed time per iteration (ms): 112335.2 | learning rate: 3.047E-05 | global batch size: 2048 | lm loss: 5.566739E+00 | loss scale: 8192.0 | grad norm: 14225.350 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 559/ 292968 | consumed samples: 1144832 | consumed tokens: 86491136 | elapsed time per iteration (ms): 113204.0 | learning rate: 3.053E-05 | global batch size: 2048 | lm loss: 5.529708E+00 | loss scale: 8192.0 | grad norm: 9114.316 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 560/ 292968 | consumed samples: 1146880 | consumed tokens: 86671360 | elapsed time per iteration (ms): 111793.0 | learning rate: 3.058E-05 | global batch size: 2048 | lm loss: 5.541924E+00 | loss scale: 8192.0 | grad norm: 9695.972 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 561/ 292968 | consumed samples: 1148928 | consumed tokens: 86851584 | elapsed time per iteration (ms): 113028.7 | learning rate: 3.064E-05 | global batch size: 2048 | lm loss: 5.521393E+00 | loss scale: 8192.0 | grad norm: 11158.709 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 562/ 292968 | consumed samples: 1150976 | consumed tokens: 87031808 | elapsed time per iteration 
(ms): 111623.4 | learning rate: 3.069E-05 | global batch size: 2048 | lm loss: 5.501397E+00 | loss scale: 8192.0 | grad norm: 11525.341 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 563/ 292968 | consumed samples: 1153024 | consumed tokens: 87212032 | elapsed time per iteration (ms): 110973.8 | learning rate: 3.075E-05 | global batch size: 2048 | lm loss: 5.487821E+00 | loss scale: 8192.0 | grad norm: 12021.366 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 564/ 292968 | consumed samples: 1155072 | consumed tokens: 87392256 | elapsed time per iteration (ms): 113374.4 | learning rate: 3.080E-05 | global batch size: 2048 | lm loss: 5.480217E+00 | loss scale: 8192.0 | grad norm: 10903.562 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 565/ 292968 | consumed samples: 1157120 | consumed tokens: 87572480 | elapsed time per iteration (ms): 112996.4 | learning rate: 3.086E-05 | global batch size: 2048 | lm loss: 5.499344E+00 | loss scale: 8192.0 | grad norm: 10305.931 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 566/ 292968 | consumed samples: 1159168 | consumed tokens: 87752704 | elapsed time per iteration (ms): 112129.1 | learning rate: 3.091E-05 | global batch size: 2048 | lm loss: 5.520879E+00 | loss scale: 8192.0 | grad norm: 12505.504 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 567/ 292968 | consumed samples: 1161216 | consumed tokens: 87932928 | elapsed time per iteration (ms): 112661.0 | learning rate: 3.097E-05 | global batch size: 2048 | lm loss: 5.531937E+00 | loss scale: 8192.0 | grad norm: 14944.754 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 568/ 292968 | consumed samples: 1163264 | consumed tokens: 88113152 | elapsed time per iteration (ms): 113956.6 | learning rate: 3.102E-05 | global batch size: 2048 | lm loss: 5.497797E+00 | loss scale: 8192.0 | grad norm: 11478.429 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 569/ 292968 | consumed samples: 1165312 | consumed tokens: 88293376 | elapsed time per iteration (ms): 112649.6 | learning rate: 3.107E-05 | global batch size: 2048 | lm loss: 5.505655E+00 | loss scale: 8192.0 | grad norm: 13474.430 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 570/ 292968 | consumed samples: 1167360 | consumed tokens: 88473600 | elapsed time per iteration (ms): 111252.0 | learning rate: 3.113E-05 | global batch size: 2048 | lm loss: 5.493463E+00 | loss scale: 8192.0 | grad norm: 14819.370 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 571/ 292968 | consumed samples: 1169408 | consumed tokens: 88653824 | elapsed time per iteration (ms): 112373.6 | learning rate: 3.118E-05 | global batch size: 2048 | lm loss: 5.485642E+00 | loss scale: 8192.0 | grad norm: 7874.211 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 572/ 292968 | consumed 
samples: 1171456 | consumed tokens: 88834048 | elapsed time per iteration (ms): 112530.1 | learning rate: 3.124E-05 | global batch size: 2048 | lm loss: 5.480896E+00 | loss scale: 8192.0 | grad norm: 14748.807 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 573/ 292968 | consumed samples: 1173504 | consumed tokens: 89014272 | elapsed time per iteration (ms): 111003.6 | learning rate: 3.129E-05 | global batch size: 2048 | lm loss: 5.495447E+00 | loss scale: 8192.0 | grad norm: 11089.801 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 574/ 292968 | consumed samples: 1175552 | consumed tokens: 89194496 | elapsed time per iteration (ms): 112117.2 | learning rate: 3.135E-05 | global batch size: 2048 | lm loss: 5.516068E+00 | loss scale: 8192.0 | grad norm: 15890.094 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 575/ 292968 | consumed samples: 1177600 | consumed tokens: 89374720 | elapsed time per iteration (ms): 113068.6 | learning rate: 3.140E-05 | global batch size: 2048 | lm loss: 5.471289E+00 | loss scale: 8192.0 | grad norm: 10932.631 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 576/ 292968 | consumed samples: 1179648 | consumed tokens: 89554944 | elapsed time per iteration (ms): 111584.4 | learning rate: 3.146E-05 | global batch size: 2048 | lm loss: 5.460034E+00 | loss scale: 8192.0 | grad norm: 14436.227 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 577/ 292968 | consumed samples: 1181696 | consumed tokens: 89735168 | elapsed time per iteration (ms): 113415.4 | learning rate: 3.151E-05 | global batch size: 2048 | lm loss: 5.467341E+00 | loss scale: 8192.0 | grad norm: 9677.502 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 578/ 292968 | consumed samples: 1183744 | consumed tokens: 89915392 | elapsed time per iteration (ms): 112958.8 | learning rate: 3.157E-05 | global batch size: 2048 | lm loss: 5.456917E+00 | loss scale: 8192.0 | grad norm: 16119.399 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 579/ 292968 | consumed samples: 1185792 | consumed tokens: 90095616 | elapsed time per iteration (ms): 111312.2 | learning rate: 3.162E-05 | global batch size: 2048 | lm loss: 5.460016E+00 | loss scale: 8192.0 | grad norm: 12161.697 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 580/ 292968 | consumed samples: 1187840 | consumed tokens: 90275840 | elapsed time per iteration (ms): 112441.1 | learning rate: 3.168E-05 | global batch size: 2048 | lm loss: 5.463281E+00 | loss scale: 8192.0 | grad norm: 12047.781 | num zeros: 0.0 | curriculum seqlen: 88 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 581/ 292968 | consumed samples: 1189888 | consumed tokens: 90472448 | elapsed time per iteration (ms): 110471.3 | learning rate: 3.173E-05 | global batch size: 2048 | lm loss: 5.491323E+00 | loss scale: 8192.0 | grad norm: 11849.322 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | 
number of nan iterations: 0 | -time (ms) - iteration 582/ 292968 | consumed samples: 1191936 | consumed tokens: 90669056 | elapsed time per iteration (ms): 108157.8 | learning rate: 3.178E-05 | global batch size: 2048 | lm loss: 5.475502E+00 | loss scale: 8192.0 | grad norm: 10832.692 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 583/ 292968 | consumed samples: 1193984 | consumed tokens: 90865664 | elapsed time per iteration (ms): 108967.2 | learning rate: 3.184E-05 | global batch size: 2048 | lm loss: 5.494294E+00 | loss scale: 8192.0 | grad norm: 14744.932 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 584/ 292968 | consumed samples: 1196032 | consumed tokens: 91062272 | elapsed time per iteration (ms): 106812.8 | learning rate: 3.189E-05 | global batch size: 2048 | lm loss: 5.487658E+00 | loss scale: 8192.0 | grad norm: 8967.567 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 585/ 292968 | consumed samples: 1198080 | consumed tokens: 91258880 | elapsed time per iteration (ms): 110130.1 | learning rate: 3.195E-05 | global batch size: 2048 | lm loss: 5.488459E+00 | loss scale: 8192.0 | grad norm: 14768.019 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 586/ 292968 | consumed samples: 1200128 | consumed tokens: 91455488 | elapsed time per iteration (ms): 106231.0 | learning rate: 3.200E-05 | global batch size: 2048 | lm loss: 5.488029E+00 | loss scale: 8192.0 | grad norm: 13756.417 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 587/ 292968 | consumed samples: 1202176 | consumed tokens: 91652096 | elapsed time per iteration (ms): 106565.7 | learning rate: 3.206E-05 | global batch size: 2048 | lm loss: 5.448896E+00 | loss scale: 8192.0 | grad norm: 8670.093 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 588/ 292968 | consumed samples: 1204224 | consumed tokens: 91848704 | elapsed time per iteration (ms): 106823.5 | learning rate: 3.211E-05 | global batch size: 2048 | lm loss: 5.481108E+00 | loss scale: 8192.0 | grad norm: 13747.563 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 589/ 292968 | consumed samples: 1206272 | consumed tokens: 92045312 | elapsed time per iteration (ms): 109210.1 | learning rate: 3.217E-05 | global batch size: 2048 | lm loss: 5.483897E+00 | loss scale: 8192.0 | grad norm: 13030.572 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 590/ 292968 | consumed samples: 1208320 | consumed tokens: 92241920 | elapsed time per iteration (ms): 107071.2 | learning rate: 3.222E-05 | global batch size: 2048 | lm loss: 5.499794E+00 | loss scale: 8192.0 | grad norm: 12956.695 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 591/ 292968 | consumed samples: 1210368 | consumed tokens: 92438528 | elapsed time per iteration (ms): 107481.3 | learning rate: 3.228E-05 | global batch size: 2048 | lm loss: 5.458858E+00 | loss scale: 8192.0 | grad norm: 8716.189 | 
num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 592/ 292968 | consumed samples: 1212416 | consumed tokens: 92635136 | elapsed time per iteration (ms): 108187.6 | learning rate: 3.233E-05 | global batch size: 2048 | lm loss: 5.468006E+00 | loss scale: 8192.0 | grad norm: 10982.591 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 593/ 292968 | consumed samples: 1214464 | consumed tokens: 92831744 | elapsed time per iteration (ms): 107146.7 | learning rate: 3.239E-05 | global batch size: 2048 | lm loss: 5.428665E+00 | loss scale: 8192.0 | grad norm: 10539.232 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 594/ 292968 | consumed samples: 1216512 | consumed tokens: 93028352 | elapsed time per iteration (ms): 110124.1 | learning rate: 3.244E-05 | global batch size: 2048 | lm loss: 5.442387E+00 | loss scale: 8192.0 | grad norm: 13381.277 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 595/ 292968 | consumed samples: 1218560 | consumed tokens: 93224960 | elapsed time per iteration (ms): 106387.0 | learning rate: 3.249E-05 | global batch size: 2048 | lm loss: 5.484375E+00 | loss scale: 8192.0 | grad norm: 11482.399 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 596/ 292968 | consumed samples: 1220608 | consumed tokens: 93421568 | elapsed time per iteration (ms): 108330.7 | learning rate: 3.255E-05 | global batch size: 2048 | lm loss: 5.424896E+00 | loss scale: 8192.0 | grad norm: 12097.178 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 597/ 292968 | consumed samples: 1222656 | consumed tokens: 93618176 | elapsed time per iteration (ms): 107065.9 | learning rate: 3.260E-05 | global batch size: 2048 | lm loss: 5.433896E+00 | loss scale: 8192.0 | grad norm: 15293.672 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 598/ 292968 | consumed samples: 1224704 | consumed tokens: 93814784 | elapsed time per iteration (ms): 106989.0 | learning rate: 3.266E-05 | global batch size: 2048 | lm loss: 5.436405E+00 | loss scale: 8192.0 | grad norm: 11111.761 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 599/ 292968 | consumed samples: 1226752 | consumed tokens: 94011392 | elapsed time per iteration (ms): 106858.4 | learning rate: 3.271E-05 | global batch size: 2048 | lm loss: 5.414397E+00 | loss scale: 8192.0 | grad norm: 13962.838 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 600/ 292968 | consumed samples: 1228800 | consumed tokens: 94208000 | elapsed time per iteration (ms): 107260.3 | learning rate: 3.277E-05 | global batch size: 2048 | lm loss: 5.419570E+00 | loss scale: 8192.0 | grad norm: 11387.759 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
------------------------------------------------------------------------------------------------
- validation loss at iteration 600 | lm loss value: 5.387414E+00 | lm loss PPL: 2.186374E+02 |
------------------------------------------------------------------------------------------------
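The logged lm loss PPL is simply the exponential of the lm loss: the loss is a per-token cross-entropy in nats, so perplexity follows directly. A minimal sanity check in Python (illustrative only, not part of the training scripts) confirms the two numbers above are consistent:

    import math

    lm_loss = 5.387414             # lm loss value at iteration 600
    ppl = math.exp(lm_loss)        # perplexity = e**loss for a cross-entropy in nats
    print(f"lm loss PPL: {ppl:.6E}")   # -> 2.186374E+02, matching the validation line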
-saving checkpoint at iteration 600 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-25 06:13:42,645] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/mp_rank_01_model_states.pt
-[2021-10-25 06:13:43,582] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/mp_rank_00_model_states.pt
-[2021-10-25 06:13:56,312] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/zero_pp_rank_0_mp_rank_17_optim_states.pt
-[2021-10-25 06:13:56,315 ... 2021-10-25 06:14:06,857] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved ... (125 similar lines, one per remaining zero_pp_rank_0_mp_rank_*_optim_states.pt shard under global_step600)
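Each optimizer-state file name encodes which shard it belongs to: zero_pp_rank_* is the ZeRO data-parallel partition and mp_rank_* the model-parallel rank, all under the global_step600 directory. A small inventory script (the helper below is hypothetical and illustrative, not part of Megatron-DeepSpeed) can confirm that one optimizer shard per rank pair made it to disk:

    import re
    from pathlib import Path

    # Hypothetical helper: count ZeRO optimizer shards in a checkpoint directory.
    PATTERN = re.compile(r"zero_pp_rank_(\d+)_mp_rank_(\d+)_optim_states\.pt")

    def optim_shards(step_dir):
        ranks = set()
        for path in Path(step_dir).glob("zero_pp_rank_*_optim_states.pt"):
            m = PATTERN.fullmatch(path.name)
            if m:
                ranks.add((int(m.group(1)), int(m.group(2))))
        return ranks

    shards = optim_shards("/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600")
    print(len(shards), "optimizer shards")   # one per (pp partition, mp_rank) pair in this log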
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-25 06:14:12,792] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-25 06:14:13,515] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step600/zero_pp_rank_0_mp_rank_00_optim_states.pt - successfully saved checkpoint at iteration 600 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 34761.21 - iteration 601/ 292968 | consumed samples: 1230848 | consumed tokens: 94404608 | elapsed time per iteration (ms): 304940.5 | learning rate: 3.282E-05 | global batch size: 2048 | lm loss: 5.396969E+00 | loss scale: 8192.0 | grad norm: 12332.412 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 602/ 292968 | consumed samples: 1232896 | consumed tokens: 94601216 | elapsed time per iteration (ms): 106807.5 | learning rate: 3.288E-05 | global batch size: 2048 | lm loss: 5.408408E+00 | loss scale: 8192.0 | grad norm: 11929.351 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 603/ 292968 | consumed samples: 1234944 | consumed tokens: 94797824 | elapsed time per iteration (ms): 107857.1 | learning rate: 3.293E-05 | global batch size: 2048 | lm loss: 5.420089E+00 | loss scale: 8192.0 | grad norm: 11171.102 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 604/ 292968 | consumed samples: 1236992 | consumed tokens: 94994432 | elapsed time per iteration (ms): 107461.0 | learning rate: 3.299E-05 | global batch size: 2048 | lm loss: 5.418396E+00 | loss scale: 8192.0 | grad norm: 9342.805 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 605/ 292968 | consumed samples: 1239040 | consumed tokens: 95191040 | elapsed time per iteration (ms): 107939.7 | learning rate: 3.304E-05 | global batch size: 2048 | lm loss: 5.415629E+00 | loss scale: 8192.0 | grad norm: 12331.412 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 606/ 292968 | consumed samples: 1241088 | consumed tokens: 95387648 | elapsed time per iteration (ms): 106693.6 | learning rate: 3.310E-05 | global batch size: 2048 | lm loss: 5.435667E+00 | loss scale: 8192.0 | grad norm: 16086.731 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 607/ 292968 | consumed samples: 1243136 | consumed tokens: 95584256 | elapsed time per iteration (ms): 107708.8 | learning rate: 3.315E-05 | global batch size: 2048 | lm loss: 5.409382E+00 | loss scale: 8192.0 | grad norm: 9374.954 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 608/ 292968 | consumed samples: 1245184 | consumed tokens: 95780864 | elapsed time per iteration (ms): 107679.7 | learning rate: 3.320E-05 | global batch size: 2048 | lm loss: 5.423688E+00 | loss scale: 8192.0 | grad norm: 12232.800 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 609/ 292968 | consumed samples: 1247232 | consumed tokens: 95977472 | elapsed time per iteration (ms): 108222.9 | learning rate: 3.326E-05 | global batch size: 2048 | lm loss: 5.402236E+00 | loss scale: 8192.0 | grad norm: 9228.233 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 610/ 292968 | consumed samples: 1249280 | consumed tokens: 96174080 | elapsed time per iteration (ms): 107400.0 | learning rate: 3.331E-05 | global batch size: 2048 | lm loss: 5.412461E+00 | loss scale: 8192.0 | grad norm: 11245.757 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 611/ 292968 | consumed samples: 1251328 | consumed tokens: 96370688 | elapsed time per iteration (ms): 106468.7 | learning rate: 3.337E-05 | global batch size: 2048 | lm loss: 5.408649E+00 | loss scale: 8192.0 | grad norm: 11344.448 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 612/ 292968 | consumed samples: 1253376 | consumed tokens: 96567296 | elapsed time per iteration (ms): 107650.3 | learning rate: 3.342E-05 | global batch size: 2048 | lm loss: 5.407639E+00 | loss scale: 8192.0 | grad norm: 11098.585 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 613/ 292968 | consumed samples: 1255424 | consumed tokens: 96763904 | elapsed time per iteration (ms): 107751.1 | learning rate: 3.348E-05 | global batch size: 2048 | lm loss: 5.380627E+00 | loss scale: 8192.0 | grad norm: 8762.937 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 614/ 292968 | consumed samples: 1257472 | consumed tokens: 96960512 | elapsed time per iteration (ms): 110635.4 | learning rate: 3.353E-05 | global batch size: 2048 | lm loss: 5.375699E+00 | loss scale: 8192.0 | grad norm: 11229.270 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 615/ 292968 | consumed samples: 1259520 | consumed tokens: 97157120 | elapsed time per iteration (ms): 108098.9 | learning rate: 3.359E-05 | global batch size: 2048 | lm loss: 5.363403E+00 | loss scale: 8192.0 | grad norm: 10400.184 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 616/ 292968 | consumed samples: 1261568 | consumed tokens: 97353728 | elapsed time per iteration (ms): 109329.1 | learning rate: 3.364E-05 | global batch size: 2048 | lm loss: 5.384151E+00 | loss scale: 8192.0 | grad norm: 12453.326 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 617/ 292968 | consumed samples: 1263616 | consumed tokens: 97550336 | elapsed time per iteration (ms): 107222.2 | learning rate: 3.370E-05 | global batch size: 2048 | lm loss: 5.365817E+00 | loss scale: 8192.0 | grad norm: 12017.613 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 618/ 292968 | consumed samples: 1265664 | consumed tokens: 97746944 | elapsed time per iteration (ms): 107139.4 | learning rate: 3.375E-05 | global batch size: 2048 | lm loss: 5.358659E+00 | loss scale: 8192.0 | grad 
norm: 9650.822 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 619/ 292968 | consumed samples: 1267712 | consumed tokens: 97943552 | elapsed time per iteration (ms): 107963.7 | learning rate: 3.381E-05 | global batch size: 2048 | lm loss: 5.360062E+00 | loss scale: 8192.0 | grad norm: 9182.645 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 620/ 292968 | consumed samples: 1269760 | consumed tokens: 98140160 | elapsed time per iteration (ms): 106941.4 | learning rate: 3.386E-05 | global batch size: 2048 | lm loss: 5.350104E+00 | loss scale: 8192.0 | grad norm: 10388.823 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 621/ 292968 | consumed samples: 1271808 | consumed tokens: 98336768 | elapsed time per iteration (ms): 108728.6 | learning rate: 3.391E-05 | global batch size: 2048 | lm loss: 5.330681E+00 | loss scale: 8192.0 | grad norm: 10010.116 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 622/ 292968 | consumed samples: 1273856 | consumed tokens: 98533376 | elapsed time per iteration (ms): 107843.5 | learning rate: 3.397E-05 | global batch size: 2048 | lm loss: 5.387991E+00 | loss scale: 8192.0 | grad norm: 11984.058 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 623/ 292968 | consumed samples: 1275904 | consumed tokens: 98729984 | elapsed time per iteration (ms): 107380.4 | learning rate: 3.402E-05 | global batch size: 2048 | lm loss: 5.347582E+00 | loss scale: 8192.0 | grad norm: 9513.099 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 624/ 292968 | consumed samples: 1277952 | consumed tokens: 98926592 | elapsed time per iteration (ms): 108875.1 | learning rate: 3.408E-05 | global batch size: 2048 | lm loss: 5.360654E+00 | loss scale: 8192.0 | grad norm: 11778.551 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 625/ 292968 | consumed samples: 1280000 | consumed tokens: 99123200 | elapsed time per iteration (ms): 106579.6 | learning rate: 3.413E-05 | global batch size: 2048 | lm loss: 5.373547E+00 | loss scale: 8192.0 | grad norm: 10277.204 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 626/ 292968 | consumed samples: 1282048 | consumed tokens: 99319808 | elapsed time per iteration (ms): 109385.4 | learning rate: 3.419E-05 | global batch size: 2048 | lm loss: 5.341951E+00 | loss scale: 8192.0 | grad norm: 10174.799 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 627/ 292968 | consumed samples: 1284096 | consumed tokens: 99516416 | elapsed time per iteration (ms): 107213.8 | learning rate: 3.424E-05 | global batch size: 2048 | lm loss: 5.362940E+00 | loss scale: 8192.0 | grad norm: 10631.689 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 628/ 292968 | consumed samples: 1286144 | consumed tokens: 99713024 | elapsed time per iteration (ms): 108581.1 | learning rate: 3.430E-05 | 
global batch size: 2048 | lm loss: 5.395461E+00 | loss scale: 8192.0 | grad norm: 12382.653 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 629/ 292968 | consumed samples: 1288192 | consumed tokens: 99909632 | elapsed time per iteration (ms): 108292.6 | learning rate: 3.435E-05 | global batch size: 2048 | lm loss: 5.370893E+00 | loss scale: 8192.0 | grad norm: 9780.522 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 630/ 292968 | consumed samples: 1290240 | consumed tokens: 100106240 | elapsed time per iteration (ms): 106744.8 | learning rate: 3.441E-05 | global batch size: 2048 | lm loss: 5.326004E+00 | loss scale: 8192.0 | grad norm: 12227.046 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 631/ 292968 | consumed samples: 1292288 | consumed tokens: 100302848 | elapsed time per iteration (ms): 107582.1 | learning rate: 3.446E-05 | global batch size: 2048 | lm loss: 5.340735E+00 | loss scale: 8192.0 | grad norm: 11877.257 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 632/ 292968 | consumed samples: 1294336 | consumed tokens: 100499456 | elapsed time per iteration (ms): 107181.5 | learning rate: 3.452E-05 | global batch size: 2048 | lm loss: 5.347682E+00 | loss scale: 8192.0 | grad norm: 12827.897 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 633/ 292968 | consumed samples: 1296384 | consumed tokens: 100696064 | elapsed time per iteration (ms): 107386.1 | learning rate: 3.457E-05 | global batch size: 2048 | lm loss: 5.321402E+00 | loss scale: 8192.0 | grad norm: 10107.434 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 634/ 292968 | consumed samples: 1298432 | consumed tokens: 100892672 | elapsed time per iteration (ms): 107175.9 | learning rate: 3.462E-05 | global batch size: 2048 | lm loss: 5.320929E+00 | loss scale: 8192.0 | grad norm: 8954.510 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 635/ 292968 | consumed samples: 1300480 | consumed tokens: 101089280 | elapsed time per iteration (ms): 107956.8 | learning rate: 3.468E-05 | global batch size: 2048 | lm loss: 5.306052E+00 | loss scale: 8192.0 | grad norm: 11726.553 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 636/ 292968 | consumed samples: 1302528 | consumed tokens: 101285888 | elapsed time per iteration (ms): 107124.1 | learning rate: 3.473E-05 | global batch size: 2048 | lm loss: 5.340025E+00 | loss scale: 8192.0 | grad norm: 9664.223 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 637/ 292968 | consumed samples: 1304576 | consumed tokens: 101482496 | elapsed time per iteration (ms): 107183.5 | learning rate: 3.479E-05 | global batch size: 2048 | lm loss: 5.298586E+00 | loss scale: 8192.0 | grad norm: 11783.685 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 638/ 292968 | consumed samples: 1306624 | consumed tokens: 
101679104 | elapsed time per iteration (ms): 107166.1 | learning rate: 3.484E-05 | global batch size: 2048 | lm loss: 5.315363E+00 | loss scale: 8192.0 | grad norm: 10217.252 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 639/ 292968 | consumed samples: 1308672 | consumed tokens: 101875712 | elapsed time per iteration (ms): 107360.8 | learning rate: 3.490E-05 | global batch size: 2048 | lm loss: 5.312271E+00 | loss scale: 8192.0 | grad norm: 10486.233 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 640/ 292968 | consumed samples: 1310720 | consumed tokens: 102072320 | elapsed time per iteration (ms): 108937.9 | learning rate: 3.495E-05 | global batch size: 2048 | lm loss: 5.286817E+00 | loss scale: 8192.0 | grad norm: 9778.188 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 641/ 292968 | consumed samples: 1312768 | consumed tokens: 102268928 | elapsed time per iteration (ms): 107300.5 | learning rate: 3.501E-05 | global batch size: 2048 | lm loss: 5.298764E+00 | loss scale: 8192.0 | grad norm: 9331.960 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) -saving checkpoint at iteration 641 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-25 07:27:49,703] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/mp_rank_00_model_states.pt -[2021-10-25 07:27:49,848] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/mp_rank_01_model_states.pt -[2021-10-25 07:28:02,627] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_107_optim_states.pt -[2021-10-25 07:28:02,627] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-25 07:28:02,665] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-25 07:28:02,705] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-25 07:28:02,708] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-25 07:28:02,710] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-25 07:28:02,769] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_28_optim_states.pt -[2021-10-25 07:28:02,814] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
[... per-rank "[INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step641/zero_pp_rank_0_mp_rank_*_optim_states.pt" log lines, 2021-10-25 07:28:02 to 07:28:16, elided ...]
- successfully saved checkpoint at iteration 641 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 29241.51
-[exiting program after 1191.7797291556994 minutes] datetime: 2021-10-25 07:28:16
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
[... the same OMP_NUM_THREADS banner, printed once per launcher process, elided ...]
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
[... identical, interleaved op reports from the remaining ranks elided ...]
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
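The async_io warnings are actionable: install libaio-devel, or point the extension build at an existing copy via CFLAGS/LDFLAGS as the warning describes. A sketch of that route, assuming a hypothetical from-source prefix /opt/libaio; AsyncIOBuilder is DeepSpeed's builder for this op:

    import os
    from deepspeed.ops.op_builder import AsyncIOBuilder

    libaio_prefix = "/opt/libaio"  # hypothetical install prefix
    # Set before the op is JIT-compiled so the build can locate libaio.
    os.environ["CFLAGS"] = "-I" + libaio_prefix + "/include"
    os.environ["LDFLAGS"] = "-L" + libaio_prefix + "/lib"

    print("async_io compatible:", AsyncIOBuilder().is_compatible())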
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
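The same fields can be read back with plain Python, which is handy when cross-checking a node against this log. A minimal sketch, assuming torch and deepspeed import from the environment shown and nvcc is on PATH:

    import subprocess
    import torch
    import deepspeed

    print("torch version ....", torch.__version__)     # 1.8.1 in this log
    print("torch cuda ......", torch.version.cuda)     # 11.1 in this log
    print("deepspeed .......", deepspeed.__version__)  # 0.5.5+29bee73 here
    print(subprocess.run(["nvcc", "--version"],
                         capture_output=True, text=True).stdout)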
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
[NO] [OKAY]............stochastic_transformer....... - [NO][OKAY]. transformer -[NO] ................... transformer....... [NO] ............[OKAY] [OKAY] -[NO] -....... .......transformer[OKAY] -[OKAY]............ - [NO] stochastic_transformer....... stochastic_transformer.[OKAY] -[NO]. .......[NO] [OKAY]stochastic_transformer - ....... .[OKAY] -[NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -[NO] ....... [NO] -transformer_inferenceasync_io .. [NO]............... .......[NO] [OKAY]....... - [NO] -utils .................. [NO] ....... [OKAY] -transformer_inference quantizer.. ..............[NO] [NO]....... .......[OKAY] -[OKAY] -utils ..................-------------------------------------------------- -[NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. ....... - [NO] -async_io transformer_inference............... ..[NO] [NO] .............. [NO][OKAY] - -utils .................. [NO] ....... [OKAY] -transformer_inference .. quantizer[NO] ..................... [NO] [OKAY]....... - [OKAY] -utils-------------------------------------------------- -.................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 
11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install path torch install path............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version torch version.................... ....................1.8.1 -1.8.1 -torch cuda version torch cuda version............... ...............11.1 -11.1nvcc version - nvcc version..................... .....................11.2 -11.2deepspeed install path - deepspeed install path........... ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed info - ...................deepspeed info 0.5.5+29bee73, 29bee73, master................... - 0.5.5+29bee73, 29bee73, masterdeepspeed wheel compiled w. - ......deepspeed wheel compiled w. torch 1.8, cuda 11.1...... - torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- --------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - --------------------------------------------------- - -------------------------------------------------------------------------------------------------------------------------------------------------------op name - - -op name................ op nameop name ................ installed................................ installed installed.. .. installed compatible ..compatible -.. --------------------------------------------------- --------------------------------------------------compatible -compatible - - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. cpu_adamcpu_adam[NO][NO] ..................................... ....... [OKAY] [NO][NO] -[OKAY] -.............. [OKAY][OKAY] - -fused_adam ............. fused_adam[NO] .................... fused_adam[NO]fused_adam[OKAY] -................................. fused_lamb [OKAY] [NO]............. - [NO][NO].......fused_lamb .............. .............[OKAY] [OKAY] -[OKAY] -[NO] - .......fused_lamb [OKAY]fused_lamb............. - .............[NO] .......[NO] [OKAY]sparse_attn - ................... [NO] .......sparse_attn [OKAY] [OKAY] -............ 
- [NO] transformer.......sparse_attn ............[OKAY]............ - [NO] [NO]transformersparse_attn....... ...............................[OKAY] -[NO][OKAY][NO] -.......stochastic_transformer....... transformer[OKAY]. -[OKAY] [NO] -stochastic_transformer............ .......transformer. [OKAY][NO][NO]............ - ..............[NO] [OKAY].......[OKAY] - -[OKAY] -stochastic_transformer stochastic_transformer . .[NO] [NO]....... .......[OKAY] -[OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -utilsutils .................................... [NO][NO] .............. [OKAY][OKAY] - -quantizer quantizer.............. ..............[NO] [NO]....... 
.......[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY] -[OKAY][OKAY] - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- -op name - - op nameop name................op name ................installed ................ ................ ..installedinstalledinstalled .. .. compatible..compatible - ---------------------------------------------------compatiblecompatible --------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... cpu_adamcpu_adam[NO]cpu_adam ...................... .............................. [NO] [OKAY] -[NO][NO]....... ..............[OKAY] -[OKAY][OKAY] - -fused_adam ............. [NO] ....... [OKAY] -fused_adamfused_adamfused_lambfused_adam ............. .......................... [NO].............[NO] [NO]..............[NO] .......[OKAY][OKAY]....... - - [OKAY][OKAY] - -fused_lamb fused_lamb.............fused_lamb [NO]............. .................... sparse_attn[NO][OKAY][NO] ............ -.............. [NO][OKAY][OKAY] - -....... [OKAY] -transformersparse_attn ............ ............[NO] [NO]sparse_attnsparse_attn....... ...............................[OKAY] -[NO] [OKAY] [NO] -.......stochastic_transformer .......transformer[OKAY] . - [OKAY] ............transformer[NO] - ............[NO]....... transformer[NO] .......[OKAY]................... - [OKAY] [NO][OKAY] - -....... [OKAY]stochastic_transformer -stochastic_transformer .stochastic_transformer. [NO][NO]. ..............[NO] [OKAY] [OKAY] -....... - [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_ioasync_io ............... ...............[NO] .......[NO] [NO]....... - [NO] -transformer_inference ..transformer_inference [NO].. .......[NO] [OKAY]....... - [OKAY] -utils .................. utils[NO] ......................... [NO][OKAY] -....... [OKAY] -quantizer .............. [NO]quantizer ..................... [OKAY][NO] - ....... [OKAY]-------------------------------------------------- - --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... DeepSpeed general environment info:['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch version .................... torch install path1.8.1 - ...............torch cuda version ............... 11.1 -nvcc version .....................['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -11.2 -deepspeed install pathtorch version ............................... 1.8.1 -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -torch cuda versiondeepspeed info .................................. 11.10.5.5+29bee73, 29bee73, master - -nvcc versiondeepspeed wheel compiled w. ........................... 11.2torch 1.8, cuda 11.1 - -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninja---------------------------------------------------------------------------------------------------- - - -JIT compiled ops requires ninja -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. [OKAY][OKAY][OKAY] - - ---------------------------------------------------[OKAY] ----------------------------------------------------------------------------------------------------- - - -op name op name--------------------------------------------------op name................ -................installed................ op name installed..installed compatible.................... - compatible--------------------------------------------------compatible - -installed-------------------------------------------------- - ---------------------------------------------------.. - compatible -cpu_adam-------------------------------------------------- -............... cpu_adam[NO] cpu_adam ............... ....... [NO]cpu_adam[OKAY] ............... -......................[NO] [OKAY] ....... -[NO] [OKAY]....... - fused_adam[OKAY] -............. [NO] ....... fused_adam[OKAY] -.............fused_adam fused_lamb[NO]............. .......[NO]fused_adam............. .......[OKAY][NO]............. - [OKAY].......[NO] - fused_lamb[OKAY] ....... -.............fused_lamb [OKAY][NO]............. - .......[NO] [OKAY] fused_lamb -....... .............[OKAY] -[NO]sparse_attn ................... [NO][OKAY] ....... - sparse_attn[OKAY] -............sparse_attn transformer [NO] ............ ................... [NO][OKAY][NO]sparse_attn - ..........................transformer [OKAY] ............ -[NO][OKAY] -[NO].......stochastic_transformer transformer.......[OKAY] . 
-[NO]............[OKAY] ....... -[NO]transformer [OKAY].......stochastic_transformer............ - [OKAY][NO]. - .......[NO] stochastic_transformer.......[OKAY] -[OKAY] -. [NO]stochastic_transformer ....... .[OKAY] -[NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO]transformer_inference - .. [NO] ....... [OKAY] -utils .................. transformer_inference[NO] ......... [NO][OKAY] -....... [OKAY] -quantizer .............. utils[NO] ......................... [NO][OKAY] -....... [OKAY] --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------- - - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ---------------------------------------------------JIT compiled ops requires ninja - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------JIT compiled ops requires ninja - - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY][OKAY] -[OKAY] - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop name op nameop name ................ ................ ................................ installed installed installedinstalled ........ compatiblecompatiblecompatiblecompatible - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -cpu_adamcpu_adamcpu_adam cpu_adam ............... ............................................. [NO][NO][NO][NO] ..................... ....... [OKAY] [OKAY] -[OKAY][OKAY] - - -fused_adam ............. fused_adamfused_adamfused_adam [NO] ....................................... ....... [NO] [NO] [NO].......[OKAY] - ....... .......[OKAY] [OKAY]fused_lamb -[OKAY] - -............. [NO] fused_lambfused_lamb fused_lamb....... ............. .............[NO].............[OKAY] ....... -[NO] [NO][OKAY]....... - .......[OKAY] -[OKAY] -sparse_attn ............ [NO] ....... [OKAY]sparse_attn - sparse_attn............transformer sparse_attn ............ [NO] ........................ [NO].......[NO][NO] [OKAY]....... -.............. [OKAY]transformer[OKAY][OKAY] - - -............transformertransformer [NO] stochastic_transformer............ ............ . .......[NO] [NO] [NO][OKAY]....... - .......[OKAY]....... [OKAY] -stochastic_transformer -[OKAY] -stochastic_transformerstochastic_transformer . .[NO]. [NO].......[NO] .......[OKAY] -.......[OKAY] -[OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
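async_io is the only op reported incompatible ([NO] ....... [NO]): building it needs the libaio development files, which is exactly what the repeated [WARNING] lines say to fix, either by installing libaio-devel with yum or by pointing CFLAGS and LDFLAGS at a source install. A stdlib-only sketch that checks whether the libaio runtime is even visible (the -devel headers are a separate package and are not detected by this):

    import ctypes.util

    # Finds the runtime shared object (e.g. libaio.so.1) if the loader can
    # see it; returns None otherwise. Dev headers must be checked separately.
    print("libaio runtime found:", ctypes.util.find_library("aio") is not None)
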
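The "DeepSpeed general environment info" block that each rank repeats can be reproduced from Python; a rough equivalent using attributes these packages expose (the nvcc version is omitted here, since it comes from the nvcc binary rather than a Python attribute):

    import torch
    import deepspeed

    print("torch install path ...............", list(torch.__path__))
    print("torch version ....................", torch.__version__)
    print("torch cuda version ...............", torch.version.cuda)
    print("deepspeed install path ...........", list(deepspeed.__path__))
    print("deepspeed info ...................", deepspeed.__version__)
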
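The "**** Git info for Megatron ****" banner records the exact checkout the job ran from (commit bdc6ad6 on branch main). A sketch of how such a banner can be generated; the git_info() helper below is hypothetical, not Megatron's actual implementation:

    import subprocess

    def git_info(repo_dir="."):
        # Ask git for the short commit hash and the current branch name.
        rev = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"], cwd=repo_dir, text=True
        ).strip()
        branch = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=repo_dir, text=True
        ).strip()
        return rev, branch

    git_hash, git_branch = git_info()
    print(f"**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****")
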
[OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ******** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference ..utils [NO].................. .......[NO] [OKAY]....... - [OKAY] -utilsquantizer ................................ [NO][NO] .............. [OKAY][OKAY] - -quantizer-------------------------------------------------- -.............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
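The libaio warnings above only matter if the async_io op is actually needed. Below is a minimal Python sketch of the workaround the warning itself suggests; the paths are placeholders, not locations from this system, and whether this suffices depends on the JIT build picking up CFLAGS/LDFLAGS as the warning describes:

import os

# Placeholder paths -- point these at wherever libaio's headers and .so live.
# Per the warning above, these variables are consulted when the op is JIT-built.
os.environ["CFLAGS"] = "-I/path/to/libaio/include"
os.environ["LDFLAGS"] = "-L/path/to/libaio/lib"

import deepspeed  # import only after the variables are set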
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
-------------------------- arguments ------------------------
-  accumulate_allreduce_grads_in_fp32 .............. False
-  adam_beta1 ...................................... 0.9
-  adam_beta2 ...................................... 0.95
-  adam_eps ........................................ 1e-08
-  adlr_autoresume ................................. False
-  adlr_autoresume_interval ........................ 1000
-  apply_query_key_layer_scaling ................... True
-  apply_residual_connection_post_layernorm ........ False
-  attention_dropout ............................... 0.1
-  attention_softmax_in_fp32 ....................... False
-  bert_binary_head ................................ True
-  bert_load ....................................... None
-  bf16 ............................................ False
-  bias_dropout_fusion ............................. True
-  bias_gelu_fusion ................................ True
-  biencoder_projection_dim ........................ 0
-  biencoder_shared_query_context_model ............ False
-  block_data_path ................................. None
-  checkpoint_activations .......................... True
-  checkpoint_in_cpu ............................... False
-  checkpoint_num_layers ........................... 1
-  clip_grad ....................................... 1.0
-  codecarbon_dir .................................. None
-  consumed_train_samples .......................... 0
-  consumed_train_tokens ........................... 0
-  consumed_valid_samples .......................... 0
-  contigious_checkpointing ........................ False
-  cpu_optimizer ................................... False
-  cpu_torch_adam .................................. False
-  curriculum_learning ............................. False
-  data_impl ....................................... mmap
-  data_parallel_size .............................. 1
-  data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
-  dataloader_type ................................. single
-  DDP_impl ........................................ local
-  decoder_seq_length .............................. None
-  deepscale ....................................... False
-  deepscale_config ................................ None
-  deepspeed ....................................... True
-  deepspeed_activation_checkpointing .............. True
-  deepspeed_config ................................ ./ds_config.1674500.json
-  deepspeed_mpi ................................... False
-  distribute_checkpointed_activations ............. False
-  distributed_backend ............................. nccl
-  embedding_path .................................. None
-  encoder_seq_length .............................. 2048
-  eod_mask_loss ................................... False
-  eval_interval ................................... 150
-  eval_iters ...................................... 5
-  evidence_data_path .............................. None
-  exit_duration_in_mins ........................... 1190
-  exit_interval ................................... None
-  ffn_hidden_size ................................. 46400
-  finetune ........................................ False
-  fp16 ............................................ True
-  fp16_lm_cross_entropy ........................... False
-  fp32_residual_connection ........................ False
-  gigaflos_no_embeds .............................. 0
-  global_batch_size ............................... 2048
-  glu_activation .................................. None
-  hidden_dropout .................................. 0.1
-  hidden_size ..................................... 11600
-  hysteresis ...................................... 2
-  ict_head_size ................................... None
-  ict_load ........................................ None
-  img_dim ......................................... 224
-  indexer_batch_size .............................. 128
-  indexer_log_interval ............................ 1000
-  init_method_std ................................. 0.006
-  init_method_xavier_uniform ...................... False
-  initial_loss_scale .............................. 4294967296
-  kv_channels ..................................... 145
-  layernorm_epsilon ............................... 1e-05
-  lazy_mpu_init ................................... None
-  load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-  local_rank ...................................... 0
-  log_batch_size_to_tensorboard ................... True
-  log_interval .................................... 1
-  log_learning_rate_to_tensorboard ................ True
-  log_loss_scale_to_tensorboard ................... True
-  log_num_zeros_in_grad ........................... False
-  log_params_norm ................................. False
-  log_timers_to_tensorboard ....................... True
-  log_validation_ppl_to_tensorboard ............... True
-  loss_on_targets_only ............................ False
-  loss_scale ...................................... 12.0
-  loss_scale_window ............................... 1000
-  lr .............................................. 0.0001
-  lr_decay_iters .................................. None
-  lr_decay_samples ................................ None
-  lr_decay_style .................................. cosine
-  lr_decay_tokens ................................. 260000000000
-  lr_warmup_fraction .............................. None
-  lr_warmup_iters ................................. 0
-  lr_warmup_samples ............................... 3750000
-  make_vocab_size_divisible_by .................... 128
-  mask_prob ....................................... 0.15
-  masked_softmax_fusion ........................... True
-  max_position_embeddings ......................... 2048
-  memory_centric_tiled_linear ..................... False
-  merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
-  micro_batch_size ................................ 1
-  min_loss_scale .................................. 1.0
-  min_lr .......................................... 6e-06
-  mmap_warmup ..................................... False
-  no_load_optim ................................... None
-  no_load_rng ..................................... None
-  no_save_optim ................................... None
-  no_save_rng ..................................... None
-  num_attention_heads ............................. 80
-  num_channels .................................... 3
-  num_classes ..................................... 1000
-  num_layers ...................................... 64
-  num_layers_per_virtual_pipeline_stage ........... None
-  num_workers ..................................... 2
-  onnx_safe ....................................... None
-  openai_gelu ..................................... False
-  optimizer ....................................... adam
-  override_lr_scheduler ........................... False
-  params_dtype .................................... torch.float16
-  partition_activations ........................... False
-  patch_dim ....................................... 16
-  pipeline_model_parallel_size .................... 32
-  position_embedding_type ......................... PositionEmbeddingType.absolute
-  profile_backward ................................ False
-  query_in_block_prob ............................. 0.1
-  rampup_batch_size ............................... None
-  rank ............................................ 0
-  remote_device ................................... none
-  reset_attention_mask ............................ False
-  reset_position_ids .............................. False
-  retriever_report_topk_accuracies ................ []
-  retriever_score_scaling ......................... False
-  retriever_seq_length ............................ 256
-  sample_rate ..................................... 1.0
-  save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-  save_interval ................................... 300
-  scatter_gather_tensors_in_pipeline .............. True
-  scattered_embeddings ............................ False
-  seed ............................................ 43
-  seq_length ...................................... 2048
-  sgd_momentum .................................... 0.9
-  short_seq_prob .................................. 0.1
-  split ........................................... 949,50,1
-  split_transformers .............................. False
-  synchronize_each_layer .......................... False
-  tensor_model_parallel_size ...................... 4
-  tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
-  tensorboard_log_interval ........................ 1
-  tensorboard_queue_size .......................... 5
-  tile_factor ..................................... 1
-  titles_data_path ................................ None
-  tokenizer_name_or_path .......................... None
-  tokenizer_type .................................. GPT2BPETokenizer
-  train_iters ..................................... None
-  train_samples ................................... 600000000
-  train_tokens .................................... 300000000000
-  use_bnb_optimizer ............................... False
-  use_checkpoint_lr_scheduler ..................... False
-  use_contiguous_buffers_in_ddp ................... False
-  use_cpu_initialization .......................... None
-  use_one_sent_docs ............................... False
-  use_pin_memory .................................. False
-  virtual_pipeline_model_parallel_size ............ None
-  vocab_extra_ids ................................. 0
-  vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
-  weight_decay .................................... 0.1
-  world_size ...................................... 128
-  zero_allgather_bucket_size ...................... 0.0
-  zero_contigious_gradients ....................... False
-  zero_reduce_bucket_size ......................... 0.0
-  zero_reduce_scatter ............................. False
-  zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
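A few consistency checks on the argument dump above, as plain arithmetic rather than Megatron code (the 12·L·h² parameter formula is the standard GPT approximation, and the padded vocabulary size of 50688 is the one reported just below):

# Parallel layout: world_size = TP x PP x DP, as reported above.
tp, pp, dp = 4, 32, 1
assert tp * pp * dp == 128                     # world_size

# Micro-batch count: the global batch split over data-parallel replicas.
global_bs, micro_bs = 2048, 1
assert global_bs // (micro_bs * dp) == 2048    # "constant 2048" micro-batches

# Approximate parameter count. ffn_hidden_size = 46400 = 4 * hidden_size here,
# so transformer blocks contribute ~12 * layers * hidden^2; embeddings on top.
hidden, layers, seq, padded_vocab = 11600, 64, 2048, 50688
params = 12 * layers * hidden**2 + padded_vocab * hidden + seq * hidden
print(f"{params / 1e9:.1f}B parameters")       # ~104.0B, matching the tr8b-104B run name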
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
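The padded-vocab line follows from make_vocab_size_divisible_by (128) and the tensor-parallel size (4): the vocabulary is rounded up to a multiple of 128 x 4 = 512. A sketch of that rounding rule (the helper name is illustrative, not Megatron's):

def pad_vocab_size(orig_size, make_divisible_by=128, tensor_parallel=4):
    multiple = make_divisible_by * tensor_parallel   # 512 for this run
    return ((orig_size + multiple - 1) // multiple) * multiple

padded = pad_vocab_size(50257)
print(padded, padded - 50257)                        # 50688 431, as logged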
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................. [OKAY].................. .................. -[OKAY] [OKAY]-------------------------------------------------- -[OKAY] - - -----------------------------------------------------------------------------------------------------op name - - --------------------------------------------------op name................ -op name installed................op name................ ..installed................installed ..compatible.. -installed compatible-------------------------------------------------- compatible -.. - - ----------------------------------------------------------------------------------------------------compatible - - --------------------------------------------------- -cpu_adam cpu_adam...............cpu_adam ...............[NO]...............cpu_adam .......[NO]...............[NO] [OKAY].......[NO] ....... - ....... [OKAY] [OKAY] -[OKAY] - -fused_adam ............. [NO] .......fused_adam fused_adam [OKAY] fused_adam............. -............. .............[NO]fused_lamb[NO] ....................[NO] .......[OKAY] [NO]....... -[OKAY] -.......[OKAY] -fused_lamb[OKAY] fused_lamb -............. fused_lamb ............. [NO] ............. .......[NO] [NO][OKAY]....... - .......[OKAY] -[OKAY]sparse_attn - ............ [NO] ....... [OKAY] -transformersparse_attn ............sparse_attn............ ............[NO]sparse_attn[NO] ............[NO].............. .......[OKAY][NO] [OKAY] -[OKAY] - -....... transformer[OKAY]transformer -stochastic_transformer............ ............transformer[NO] .[NO]....... ............ [NO].......[OKAY] -.......[NO][OKAY] -[OKAY]....... - stochastic_transformer [OKAY]stochastic_transformer - . [NO]. ....... stochastic_transformer [NO][OKAY] -. .......[NO] [OKAY]....... - [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. [OKAY] [OKAY][OKAY] -[OKAY] - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -op name -op name op name ................op name................ installed ................................ installed ..installed.. installed compatiblecompatible.... - - --------------------------------------------------compatible ---------------------------------------------------compatible - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... 
[NO]cpu_adam .......cpu_adam...............cpu_adam [OKAY]...............[NO]............... - [NO].......[NO] .......[OKAY]....... - [OKAY][OKAY] - -fused_adam ............. [NO] ....... fused_adam[OKAY]fused_adamfused_adam - ....................................... [NO][NO][NO] fused_lamb ..................... ............. [OKAY] [OKAY][OKAY] -[NO] - - ....... fused_lamb[OKAY]fused_lamb -fused_lamb ....................................... [NO][NO][NO] ..................... [OKAY][OKAY][OKAY] - - -sparse_attn ............ [NO] ....... [OKAY] -transformersparse_attn sparse_attnsparse_attn........................ ............ [NO]............ [NO][NO] ....... ..............[NO] [OKAY] [OKAY] -....... -[OKAY] -[OKAY] -transformerstochastic_transformertransformer transformer ............. ........................ [NO][NO][NO][NO] ..................... ....... [OKAY][OKAY] - -[OKAY][OKAY] - -stochastic_transformer stochastic_transformer.stochastic_transformer [NO] .. ....... [NO] [NO] [OKAY] ....... -....... [OKAY][OKAY] - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -[NO] -async_iotransformer_inference ................. [NO][NO] .............. [NO][OKAY] - -utils .................. [NO] ....... [OKAY] -transformer_inference .. quantizer[NO] ..................... [NO][OKAY] -....... [OKAY]utils - .................. [NO] .......-------------------------------------------------- -[OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - - -async_ioasync_ioasync_io ............................................. [NO][NO][NO] ..................... [NO][NO][NO] - - -transformer_inferencetransformer_inferencetransformer_inference ...... [NO][NO][NO] ..................... [OKAY][OKAY][OKAY] - - -utils utils..................utils .................................... [NO] [NO] [NO] ....... ....... ....... [OKAY] [OKAY] -[OKAY] - -quantizerquantizerquantizer .......................................... [NO][NO][NO] ..................... [OKAY][OKAY][OKAY] - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ---------------------------------------------------DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - - - -DeepSpeed C++/CUDA extension op report--------------------------------------------------DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ---------------------------------------------------JIT compiled ops requires ninja - - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY] -[OKAY] - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op nameop nameop nameop name ................................................................ installedinstalledinstalled installed .. ...... compatible - compatiblecompatible--------------------------------------------------compatible - - - ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- -cpu_adam ............... cpu_adamcpu_adam[NO]cpu_adam ............................................. [NO][NO][NO] ..................... ....... [OKAY] [OKAY][OKAY] - -[OKAY] - -fused_adamfused_adam fused_adam ............. ............. .............fused_adam [NO][NO] ..............[NO] ............. [OKAY][OKAY] - - .......[NO]fused_lambfused_lamb .............[OKAY]............. ....... -[NO][NO] [OKAY] ....... -.......fused_lamb [OKAY] fused_lamb[OKAY] -............. - [NO] .................... 
[NO] [OKAY]....... -sparse_attn sparse_attn [OKAY] ........................ [NO][NO] -..............sparse_attn [OKAY][OKAY]............ - - [NO] transformer.......transformer ............[OKAY] -............[NO] .......transformer[NO] sparse_attn .......[OKAY] ............ -[OKAY]............ [NO] - stochastic_transformer.......stochastic_transformer [NO][OKAY] ........ . -[OKAY] [NO][NO]stochastic_transformer -.............. .[OKAY] transformer - [OKAY][NO] - ....... ............[OKAY] - [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ------------------------------------------------------------------------------------------------------------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaJIT compiled ops requires ninja --------------------------------------------------- - - --------------------------------------------------- -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY] -[OKAY][OKAY] - --------------------------------------------------- - -------------------------------------------------------------------------------------------------------------------------------------------------------op name - - -op nameop name................op name ................................ ................installed installed installed..installed.. compatible..compatible.. --------------------------------------------------- - ---------------------------------------------------compatiblecompatible - - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adamcpu_adamcpu_adam .............................. ............... ............... [NO][NO] [NO] .............. [OKAY] ....... -[OKAY] -[NO][OKAY] - ....... [OKAY] -fused_adam ............. [NO]fused_adamfused_adam .................... ............. [OKAY] [NO] -[NO] ..............fused_lamb fused_adam[OKAY]............. [OKAY] - -[NO] ....... fused_lamb[OKAY]fused_lamb - ....................................... [NO] [NO] [NO] .............. .......[OKAY] [OKAY][OKAY]sparse_attn - - ............ -[NO] ....... [OKAY] -fused_lamb transformer sparse_attn............ .............sparse_attn ............ 
[NO] ............ [NO][NO] .......[NO]....... [OKAY].......[OKAY] - ....... -[OKAY] -stochastic_transformertransformer [OKAY]transformer............. ............ [NO] -[NO] .......[NO]....... .......[OKAY][OKAY] - -[OKAY] -stochastic_transformer stochastic_transformer. [NO]. [NO] sparse_attn....... .......[OKAY] [OKAY] -............ - [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. ......................................................[OKAY] -[OKAY][OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -op name - op nameop nameop name................ ................................................installed installed.. installedinstalled .. compatible .... -compatible ---------------------------------------------------compatiblecompatible-------------------------------------------------- - - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ...............cpu_adam [NO]cpu_adam............... ...................... [NO][NO][OKAY] -cpu_adam.............. ...............[OKAY][OKAY] - -[NO] .......fused_adam ............. [NO][OKAY] -fused_adam.......fused_adam .............[OKAY]............. -[NO] [NO].......fused_lamb .......[OKAY]............. - [OKAY][NO] - ....... [OKAY]fused_lamb - fused_lamb............. .............[NO]fused_adam [NO].................... .......[NO][OKAY] -[OKAY] - ....... sparse_attn[OKAY] -............ [NO] ....... 
[OKAY] -sparse_attn ............sparse_attn fused_lambtransformer[NO]............ ............[NO]....... [NO]....................[OKAY] -.......[OKAY][NO] transformer -[OKAY] -............ transformer.......[NO] stochastic_transformer ............ [OKAY] ........[NO] -[NO][OKAY]....... - .......[OKAY] -[OKAY] -stochastic_transformer stochastic_transformer . .[NO] [NO]....... .......[OKAY] -[OKAY]sparse_attn - ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference ..utils [NO].................. .......[NO] [OKAY]....... - [OKAY] -quantizerutils ................................ [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed infoDeepSpeed general environment info: ................... 0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1torch install path - ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path ............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 
11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
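The op table above is printed once per rank, which is why later copies in the raw stream appear character-interleaved. A sketch of reproducing it from a single process, assuming the op_builder layout of DeepSpeed 0.5.x (builder class names may differ in other versions):

    # Each CUDA/C++ op has an OpBuilder whose is_compatible() is the same
    # probe behind the [OKAY]/[NO] "compatible" column above.
    from deepspeed.ops.op_builder import (
        CPUAdamBuilder,
        FusedAdamBuilder,
        FusedLambBuilder,
        SparseAttnBuilder,
        TransformerBuilder,
        StochasticTransformerBuilder,
    )

    for builder in (CPUAdamBuilder(), FusedAdamBuilder(), FusedLambBuilder(),
                    SparseAttnBuilder(), TransformerBuilder(),
                    StochasticTransformerBuilder()):
        status = "[OKAY]" if builder.is_compatible() else "[NO]"
        print(f"{builder.name:<23} {status}")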
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
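The "DeepSpeed general environment info" block likewise repeats once per rank. A rough single-process equivalent using only public attributes (a sketch, not necessarily what DeepSpeed's own reporter calls):

    import subprocess

    import torch
    import deepspeed

    print("torch install path ...", torch.__path__)
    print("torch version ........", torch.__version__)
    print("torch cuda version ...", torch.version.cuda)
    # nvcc reports the system CUDA toolkit, which can differ from torch's
    # bundled CUDA (11.2 vs 11.1 in this log).
    nvcc = subprocess.run(["nvcc", "--version"], capture_output=True, text=True)
    print("nvcc version .........", nvcc.stdout.splitlines()[-1])
    print("deepspeed install ....", deepspeed.__path__)
    print("deepspeed version ....", deepspeed.__version__)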
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
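Per the NOTE in the op report, ops listed as installed=[NO] but compatible=[OKAY] are built with ninja on first use. A sketch, again assuming DeepSpeed 0.5.x, of forcing that JIT build ahead of the first training step rather than paying the compile cost mid-run:

    # load() JIT-compiles the extension with ninja if it is not built yet,
    # then returns the loaded module.
    from deepspeed.ops.op_builder import CPUAdamBuilder

    cpu_adam = CPUAdamBuilder().load()
    print("cpu_adam op loaded:", cpu_adam)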
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
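In the op report above, the first column ([NO]) means the op was not pre-compiled when DeepSpeed was installed, and the second ([OKAY]) means the system can JIT-compile it on first use. A hedged sketch of the programmatic equivalent, assuming the op_builder layout of DeepSpeed 0.5.x:

    from deepspeed.ops.op_builder import CPUAdamBuilder

    builder = CPUAdamBuilder()
    print(builder.is_compatible())  # True corresponds to "[OKAY]" above
    cpu_adam = builder.load()       # first call triggers the ninja JIT build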
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-25 17:00:32,554] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.304 seconds
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-                               !! WARNING !!
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-                               !! WARNING !!
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
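The "Building extension module ..." lines come from PyTorch's JIT extension loader, and the repeated UserWarning is only about the default host compiler name ("c++") differing from the g++ that PyTorch was built with. A sketch of the mechanism with illustrative file names (the real sources and flags live under megatron/fused_kernels and are not copied here):

    import os
    from torch.utils import cpp_extension

    # cpp_extension reads the host compiler from $CXX (default "c++");
    # pointing it at g++ is the usual way to silence the warning above.
    os.environ.setdefault("CXX", "g++")

    ext = cpp_extension.load(
        name="scaled_masked_softmax_cuda",
        sources=["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"],
        extra_cuda_cflags=["-O3"],
        verbose=True,  # prints the same "Emitting ninja build file ..." lines
    )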
->>> done with compiling and loading fused kernels. 
Compilation time: 19.219 seconds -time to initialize megatron (seconds): 36.130 -[after megatron is initialized] datetime: 2021-10-25 17:00:52 -building GPT model ... -[2021-10-25 17:00:52,194] [INFO] [utils.py:806:see_memory_usage] Before Building Model -[2021-10-25 17:00:52,194] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB -[2021-10-25 17:00:52,195] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.96 GB, percent = 21.3% -SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None -Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, 
ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127} -[2021-10-25 17:00:53,947] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer -stage=0 layers=5 - 0: _to_float16 - 1: EmbeddingPipe - 2: - 3: ParallelTransformerLayerPipe - 4: ParallelTransformerLayerPipe -stage=1 layers=2 - 5: ParallelTransformerLayerPipe - 6: ParallelTransformerLayerPipe -stage=2 layers=2 - 7: ParallelTransformerLayerPipe - 8: ParallelTransformerLayerPipe -stage=3 layers=2 - 9: ParallelTransformerLayerPipe - 10: ParallelTransformerLayerPipe -stage=4 layers=2 - 11: ParallelTransformerLayerPipe - 12: ParallelTransformerLayerPipe -stage=5 layers=2 - 13: ParallelTransformerLayerPipe - 14: ParallelTransformerLayerPipe -stage=6 layers=2 - 15: ParallelTransformerLayerPipe - 16: ParallelTransformerLayerPipe -stage=7 layers=2 - 17: ParallelTransformerLayerPipe - 18: ParallelTransformerLayerPipe -stage=8 layers=2 - 19: ParallelTransformerLayerPipe - 20: 
ParallelTransformerLayerPipe
-stage=9 layers=2
- 21: ParallelTransformerLayerPipe
- 22: ParallelTransformerLayerPipe
-stage=10 layers=2
- 23: ParallelTransformerLayerPipe
- 24: ParallelTransformerLayerPipe
-stage=11 layers=2
- 25: ParallelTransformerLayerPipe
- 26: ParallelTransformerLayerPipe
-stage=12 layers=2
- 27: ParallelTransformerLayerPipe
- 28: ParallelTransformerLayerPipe
-stage=13 layers=2
- 29: ParallelTransformerLayerPipe
- 30: ParallelTransformerLayerPipe
-stage=14 layers=2
- 31: ParallelTransformerLayerPipe
- 32: ParallelTransformerLayerPipe
-stage=15 layers=2
- 33: ParallelTransformerLayerPipe
- 34: ParallelTransformerLayerPipe
-stage=16 layers=2
- 35: ParallelTransformerLayerPipe
- 36: ParallelTransformerLayerPipe
-stage=17 layers=2
- 37: ParallelTransformerLayerPipe
- 38: ParallelTransformerLayerPipe
-stage=18 layers=2
- 39: ParallelTransformerLayerPipe
- 40: ParallelTransformerLayerPipe
-stage=19 layers=2
- 41: ParallelTransformerLayerPipe
- 42: ParallelTransformerLayerPipe
-stage=20 layers=2
- 43: ParallelTransformerLayerPipe
- 44: ParallelTransformerLayerPipe
-stage=21 layers=2
- 45: ParallelTransformerLayerPipe
- 46: ParallelTransformerLayerPipe
-stage=22 layers=2
- 47: ParallelTransformerLayerPipe
- 48: ParallelTransformerLayerPipe
-stage=23 layers=2
- 49: ParallelTransformerLayerPipe
- 50: ParallelTransformerLayerPipe
-stage=24 layers=2
- 51: ParallelTransformerLayerPipe
- 52: ParallelTransformerLayerPipe
-stage=25 layers=2
- 53: ParallelTransformerLayerPipe
- 54: ParallelTransformerLayerPipe
-stage=26 layers=2
- 55: ParallelTransformerLayerPipe
- 56: ParallelTransformerLayerPipe
-stage=27 layers=2
- 57: ParallelTransformerLayerPipe
- 58: ParallelTransformerLayerPipe
-stage=28 layers=2
- 59: ParallelTransformerLayerPipe
- 60: ParallelTransformerLayerPipe
-stage=29 layers=2
- 61: ParallelTransformerLayerPipe
- 62: ParallelTransformerLayerPipe
-stage=30 layers=2
- 63: ParallelTransformerLayerPipe
- 64: ParallelTransformerLayerPipe
-stage=31 layers=6
- 65: ParallelTransformerLayerPipe
- 66: ParallelTransformerLayerPipe
- 67:
- 68: MixedFusedLayerNorm
- 69: EmbeddingPipe
- 70: float16_to_fp32
- loss: CrossEntropy
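Taken together, the topology dump and the stage map above pin down the whole 128-GPU layout. A back-of-the-envelope check (a sketch inferred from the log, not DeepSpeed's code):

    # 4-way tensor parallel x 32 pipeline stages x 1 data-parallel replica.
    TP, PP, DP = 4, 32, 1
    assert TP * PP * DP == 128  # world size

    def global_rank(pipe, model, data=0):
        # mapping inferred from the "Using topology" dump above
        return (pipe * DP + data) * TP + model

    assert global_rank(pipe=17, model=1) == 69  # ProcessCoord(pipe=17, ..., model=1): 69

    # "method type:transformer" spreads the 64 ParallelTransformerLayerPipe
    # items (3..66 in the map) evenly across stages; everything else rides
    # on the first and last stage.
    per_stage = 64 // PP       # 2 -> "stage=k layers=2"
    assert per_stage + 3 == 5  # stage 0 adds _to_float16, EmbeddingPipe, one unnamed item
    assert per_stage + 4 == 6  # stage 31 adds an unnamed item, MixedFusedLayerNorm, EmbeddingPipe, float16_to_fp32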
of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800 - > number of parameters on (tensor, pipeline) model 
parallel rank (1, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800 - - - > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800 - > 
number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - - - > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
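
The per-rank totals above are internally consistent with the two-layer stages in the layout just printed. This part of the log does not print the model dimensions, so the hidden size, tensor-parallel degree, and vocab/sequence sizes in the sketch below are inferred assumptions, not logged values; the arithmetic, however, reproduces every figure exactly:

    # Hedged sanity check of the per-rank parameter counts; H, TP, V_PADDED and
    # SEQ are assumptions inferred from the totals, not values printed here.
    H, TP, LAYERS_PER_STAGE = 11600, 4, 2

    # Dense weights per transformer layer: QKV 3*H*H, attention output H*H,
    # MLP 4*H*H + 4*H*H -> 12*H*H in all, sharded across the TP ranks.
    dense = LAYERS_PER_STAGE * 12 * H * H // TP

    # Layernorms and biases: 2 layernorms (weight + bias = 4*H), QKV bias 3*H/TP,
    # attention-output bias H, MLP biases 4*H/TP and H.
    no_decay = LAYERS_PER_STAGE * (4 * H + 3 * H // TP + H + 4 * H // TP + H)

    assert dense == 807_360_000             # matches the ZeRO partition sizes below
    assert no_decay == 179_800
    assert dense + no_decay == 807_539_800  # middle pipeline stages, above

    # First/last stages add TP-sharded word embeddings plus position embeddings;
    # the last stage also carries the final MixedFusedLayerNorm (2*H parameters).
    V_PADDED, SEQ = 50_688, 2_048
    emb = V_PADDED * H // TP + SEQ * H
    assert dense + no_decay + emb == 978_291_800          # pipeline rank 0
    assert dense + no_decay + emb + 2 * H == 978_315_000  # pipeline rank 31
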
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-[2021-10-25 17:00:54,643] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-25 17:00:54,644] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-25 17:00:54,644] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.13 GB, percent = 21.4%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-25 17:00:54,645] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-[2021-10-25 17:00:54,682] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-25 17:00:54,682] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-25 17:00:54,682] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-25 17:00:54,682] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-25 17:00:54,682] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-25 17:00:54,682] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-25 17:00:54,682] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-25 17:00:54,682] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-25 17:00:54,682] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-25 17:00:54,682] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
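
These optimizer lines describe a ZeRO stage-1, fp16 run with the 5e8 default bucket sizes and the training script's own FusedAdam passed through as the basic optimizer. Below is a minimal sketch of a DeepSpeed config fragment that would produce them; it is illustrative, not the run's actual config file, and the micro-batch size is a placeholder:

    # Sketch of a DeepSpeed config consistent with the log lines above. Because
    # the client supplies its own Optimizer (FusedAdam), no "optimizer" section
    # is needed in the config itself.
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,       # placeholder, not from this log
        "fp16": {"enabled": True},                 # -> "Creating fp16 ZeRO stage 1 optimizer"
        "zero_optimization": {
            "stage": 1,
            "reduce_bucket_size": 500_000_000,     # -> "Reduce bucket size 500000000"
            "allgather_bucket_size": 500_000_000,  # -> "Allgather bucket size 500000000"
        },
    }

Such a dict would normally reach the engine as the --deepspeed_config JSON file on the command line (or via the config argument of deepspeed.initialize in recent releases).
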
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-                               !! WARNING !!
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler PyTorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-                               !! WARNING !!
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils...
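
The "utils" extension being compiled here is DeepSpeed's fused flatten/unflatten helper, JIT-built with ninja on first use and cached under the directory named in the "PyTorch extensions root" lines (overridable through the TORCH_EXTENSIONS_DIR environment variable). A minimal sketch of the load path, assuming the op_builder API of DeepSpeed 0.5.x:

    # Roughly what each rank does when the ZeRO optimizer starts up: the first
    # rank to arrive triggers the ninja build, later loads hit the cache.
    from deepspeed.ops.op_builder import UtilsBuilder

    util_ops = UtilsBuilder().load()  # prints "Loading extension module utils..."

    # The loaded module exposes the fused helpers ZeRO uses to pack gradients
    # into flat buffers for bucketed all-reduce:
    #   flat_buffer = util_ops.flatten(list_of_tensors)
    #   tensors     = util_ops.unflatten(flat_buffer, list_of_tensors)
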
-Time to load utils op: 1.11-1.22 seconds (reported once per rank)
-Rank: 4-123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 0-3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 124-127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
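
The "partition count [1, 1]" lines are worth decoding: ZeRO-1 shards optimizer state across the data-parallel group only, and with all 128 GPUs consumed by tensor and pipeline parallelism the data-parallel degree is 1, so each of the two parameter groups (decayed weights; no-decay biases and layernorms) stays in a single partition. A short check, with the parallel degrees taken as assumptions read off the rank grid above:

    # ZeRO-1 partitions optimizer state over data-parallel replicas only; the
    # TP/PP degrees are assumptions inferred from the layout, not logged config.
    WORLD, TP, PP = 128, 4, 32
    dp_degree = WORLD // (TP * PP)
    assert dp_degree == 1  # -> "partition count [1, 1]": one shard per param group

    # The group sizes also match the per-stage breakdown above: embeddings join
    # the decayed group on the first/last stages, the final layernorm the
    # no-decay group.
    assert 978_112_000 - 807_360_000 == 170_752_000  # embedding weights per TP rank
    assert 203_000 - 179_800 == 23_200               # final MixedFusedLayerNorm, 2*11600
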
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0010-0.0051 seconds (reported once per rank)
-No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0011832714080810547 seconds -Time to load utils op: 0.0013349056243896484 seconds -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0010159015655517578 seconds -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.001958608627319336 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0010900497436523438 seconds -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0010204315185546875 seconds -Time to load utils op: 0.0013823509216308594 seconds -Time to load utils op: 0.0010464191436767578 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.001142740249633789 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0013470649719238281 seconds -Time to load utils op: 0.0019409656524658203 seconds -Time to load utils op: 0.0012180805206298828 seconds -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0013074874877929688 seconds -Loading extension module utils... -Time to load utils op: 0.0011911392211914062 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.002157926559448242 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0010383129119873047 seconds -Time to load utils op: 0.0019485950469970703 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0009944438934326172 seconds -Time to load utils op: 0.001720428466796875 seconds -Loading extension module utils... 
-Time to load utils op: 0.0020465850830078125 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0012235641479492188 seconds -Time to load utils op: 0.0021157264709472656 seconds -Time to load utils op: 0.0016856193542480469 seconds -Time to load utils op: 0.0019021034240722656 seconds -Time to load utils op: 0.0019292831420898438 seconds -Time to load utils op: 0.0011157989501953125 secondsTime to load utils op: 0.0010592937469482422 seconds - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.001744985580444336 secondsTime to load utils op: 0.0016849040985107422 seconds - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0011582374572753906 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0020804405212402344 seconds -Time to load utils op: 0.002135038375854492 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0010581016540527344 seconds -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0011475086212158203 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - -Time to load utils op: 0.0019292831420898438 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00150299072265625 seconds -Loading extension module utils... -Time to load utils op: 0.001977682113647461 seconds -Time to load utils op: 0.002140045166015625 secondsTime to load utils op: 0.001988649368286133 seconds - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0014204978942871094 seconds -Time to load utils op: 0.0018057823181152344 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.001382589340209961 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0015447139739990234 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0011267662048339844 seconds -Time to load utils op: 0.0017805099487304688 seconds -Time to load utils op: 0.001306772232055664 seconds -Time to load utils op: 0.0018398761749267578 seconds -Time to load utils op: 0.0017178058624267578 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0017023086547851562 seconds -Time to load utils op: 0.001615762710571289 seconds -Time to load utils op: 0.0019032955169677734 seconds -Time to load utils op: 0.001617431640625 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0015304088592529297 secondsTime to load utils op: 0.0016951560974121094 seconds - -Time to load utils op: 0.0012946128845214844 secondsTime to load utils op: 0.0011167526245117188 seconds - -Time to load utils op: 0.001875162124633789 seconds -Time to load utils op: 0.0010035037994384766 seconds -Time to load utils op: 0.001435995101928711 seconds -Time to load utils op: 0.0009548664093017578 seconds -Time to load utils op: 0.002081632614135742 seconds -Time to load utils op: 0.0012013912200927734 seconds -Time to load utils op: 0.0020668506622314453 seconds -Time to load utils op: 0.0022614002227783203 seconds -Time to load utils op: 0.001992940902709961 seconds -Time to load utils op: 0.0018658638000488281 secondsTime to load utils op: 0.0019426345825195312 seconds - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.001821756362915039 seconds -Time to load utils op: 0.0018794536590576172 seconds -Time to load utils op: 0.0010008811950683594 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0024580955505371094 seconds -Time to load utils op: 0.0022573471069335938 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0020515918731689453 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.00107574462890625 seconds -Time to load utils op: 0.002051830291748047 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0019366741180419922 seconds -Time to load utils op: 0.0020515918731689453 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0019481182098388672 seconds -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
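The four messages above come from PyTorch's JIT C++ extension loader, which DeepSpeed invokes once per process to build or reuse its fused utils op; with 128 ranks the same lines repeat, interleaved, once per rank. A minimal sketch of how the op is loaded and where the "PyTorch extensions root" comes from; UtilsBuilder is DeepSpeed's builder class, but treat the exact import path and API as version-dependent:

    # Sketch: how DeepSpeed JIT-loads its 'utils' op (API is version-dependent).
    import os

    # The "PyTorch extensions root" in the log is controlled by this variable;
    # it defaults to ~/.cache/torch_extensions when unset.
    os.environ.setdefault("TORCH_EXTENSIONS_DIR",
                          os.path.expanduser("~/.cache/torch_extensions"))

    from deepspeed.ops.op_builder import UtilsBuilder  # assumed import path
    utils_op = UtilsBuilder().load()  # compiles on first use, then reuses the cached .so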
-[2021-10-25 17:00:57,665] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-25 17:00:57,666] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-25 17:00:57,666] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.15 GB, percent = 21.4%
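The see_memory_usage lines report torch's CUDA allocator counters (MA = currently allocated, Max_MA = peak allocated, CA = reserved/cached, Max_CA = peak reserved) plus host memory from psutil. A sketch that reproduces the same readings; the function name mirrors the log, the formatting is illustrative, and a CUDA device is required:

    # Sketch: reproduce the MA/Max_MA/CA/Max_CA and CPU readings seen in the log.
    import torch
    import psutil

    def see_memory_usage(tag: str) -> None:
        gb = 1024 ** 3
        print(f"{tag} | "
              f"MA {torch.cuda.memory_allocated() / gb:.2f} GB "
              f"Max_MA {torch.cuda.max_memory_allocated() / gb:.2f} GB "
              f"CA {torch.cuda.memory_reserved() / gb:.2f} GB "
              f"Max_CA {torch.cuda.max_memory_reserved() / gb:.2f} GB")
        vm = psutil.virtual_memory()
        print(f"CPU Virtual Memory: used = {vm.used / gb:.2f} GB, percent = {vm.percent}%")

    see_memory_usage("Before initializing optimizer states")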
-[2021-10-25 17:00:57,716] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-25 17:00:57,717] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-25 17:00:57,717] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.15 GB, percent = 21.4%
-[2021-10-25 17:00:57,717] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-25 17:00:57,750] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-25 17:00:57,751] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-25 17:00:57,751] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.15 GB, percent = 21.4%
-[2021-10-25 17:00:57,751] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-25 17:00:57,751] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-25 17:00:57,751] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-25 17:00:57,751] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-25 17:00:57,751] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-25 17:00:57,751] [INFO] [config.py:944:print] activation_checkpointing_config {
-  "partition_activations": false,
-  "contiguous_memory_optimization": false,
-  "cpu_checkpointing": false,
-  "number_checkpoints": null,
-  "synchronize_checkpoint_boundary": false,
-  "profile": false
-}
-[2021-10-25 17:00:57,751] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-25 17:00:57,751] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-25 17:00:57,751] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-25 17:00:57,751] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-25 17:00:57,751] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] flops_profiler_config ........ {
-  "enabled": false,
-  "profile_step": 1,
-  "module_depth": -1,
-  "top_modules": 1,
-  "detailed": true,
-  "output_file": null
-}
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-25 17:00:57,752] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] zero_config .................. {
-  "stage": 1,
-  "contiguous_gradients": true,
-  "reduce_scatter": true,
-  "reduce_bucket_size": 5.000000e+08,
-  "allgather_partitions": true,
-  "allgather_bucket_size": 5.000000e+08,
-  "overlap_comm": false,
-  "load_from_fp32_weights": true,
-  "elastic_checkpoint": true,
-  "offload_param": null,
-  "offload_optimizer": null,
-  "sub_group_size": 1.000000e+09,
-  "prefetch_bucket_size": 5.000000e+07,
-  "param_persistence_threshold": 1.000000e+05,
-  "max_live_parameters": 1.000000e+09,
-  "max_reuse_distance": 1.000000e+09,
-  "gather_fp16_weights_on_model_save": false,
-  "ignore_unused_parameters": true,
-  "round_robin_gradients": false,
-  "legacy_stage1": false
-}
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-25 17:00:57,753] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-25 17:00:57,753] [INFO] [config.py:946:print] json = {
-  "train_micro_batch_size_per_gpu": 1,
-  "train_batch_size": 2.048000e+03,
-  "gradient_clipping": 1.0,
-  "zero_optimization": {
-    "stage": 1
-  },
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 12
-  },
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": 64,
-    "max_difficulty": 2.048000e+03,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": 3.600000e+04,
-      "difficulty_step": 8
-    }
-  },
-  "steps_per_print": 2.000000e+03,
-  "wall_clock_breakdown": false
-}
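The final json block is the client-supplied DeepSpeed config. A sketch of the same settings rebuilt as a Python dict and handed to deepspeed.initialize; the model is a placeholder toy module, a real run is started under the deepspeed launcher, and on 2021-era DeepSpeed the keyword may be config_params rather than config. Note the identity the engine enforces: train_batch_size (2048) = train_micro_batch_size_per_gpu (1) x gradient_accumulation_steps (2048) x data-parallel size (1 here, since all 128 GPUs are consumed by pipeline and tensor parallelism):

    # Sketch only: the logged client config as a Python dict ('model' is a placeholder).
    import torch
    import deepspeed

    model = torch.nn.Linear(8, 8)  # placeholder for the real pipeline-parallel model

    ds_config = {
        "train_micro_batch_size_per_gpu": 1,
        # 2048 = 1 micro-batch/GPU x 2048 gradient-accumulation steps x data-parallel size 1
        "train_batch_size": 2048,
        "gradient_clipping": 1.0,
        "zero_optimization": {"stage": 1},
        "fp16": {
            "enabled": True,
            "loss_scale": 0,            # 0 selects dynamic loss scaling
            "loss_scale_window": 500,
            "hysteresis": 2,
            "min_loss_scale": 1,
            "initial_scale_power": 12,  # initial dynamic scale 2**12 = 4096, as logged
        },
        "curriculum_learning": {
            "enabled": True,
            "curriculum_type": "seqlen",
            "min_difficulty": 64,
            "max_difficulty": 2048,
            "schedule_type": "fixed_linear",
            "schedule_config": {"total_curriculum_step": 36000, "difficulty_step": 8},
        },
        "steps_per_print": 2000,
        "wall_clock_breakdown": False,
    }

    engine, optimizer, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config=ds_config)

    def seqlen_at(step: int) -> int:
        # Approximate shape of the 'fixed_linear' seqlen curriculum above: linear from
        # 64 to 2048 tokens over 36000 steps, quantized to multiples of 8. DeepSpeed's
        # scheduler applies its own rounding, so treat this as illustrative.
        lo, hi, total, quantum = 64, 2048, 36000, 8
        raw = lo + (hi - lo) * min(step / total, 1.0)
        return min(hi, int(raw) // quantum * quantum)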
-[2021-10-25 17:00:57,754] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-25 17:00:58,149] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)

Pipeline partition reported by all 128 ranks (rank = 4*stage + tensor-parallel rank):

  STAGE   RANKS     LAYERS                          STAGE_PARAMS (per TP rank)
  0       0-3       5    [0, 5)                     978291800  (978.292M)
  1-30    4-123     2    [5, 7) through [63, 65)    807539800  (807.540M)
  31      124-127   6    [65, 71)                   978315000  (978.315M)

  Every rank reports TOTAL_PARAMS=104731203200 (104731.203M) and UNIQUE_PARAMS=104048195200 (104048.195M).
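The per-rank report collapses to the table above because the layout is regular: 128 ranks = 32 pipeline stages x 4-way tensor parallelism. The parameter counts are internally consistent, and the TOTAL minus UNIQUE gap equals four TP shards of the stage-0 surplus over a middle stage, consistent with a tied embedding counted on both end stages. A quick check, with every constant taken from the log:

    # Quick consistency check of the partition report (all constants from the log).
    total_ranks, pp_stages = 128, 32
    tp = total_ranks // pp_stages                      # 4-way tensor parallelism

    def coords(rank: int) -> tuple[int, int]:
        # Layout implied by the log: consecutive blocks of 4 ranks share a stage.
        return rank // tp, rank % tp

    assert coords(67) == (16, 3)                       # log: RANK=67 STAGE=16

    first, middle, last = 978_291_800, 807_539_800, 978_315_000
    total, unique = 104_731_203_200, 104_048_195_200

    # Per-shard stage params, summed over 32 stages and 4 TP shards, give TOTAL_PARAMS:
    assert tp * (first + 30 * middle + last) == total
    # The replicated parameters equal 4 TP shards of the stage-0 surplus over a middle
    # stage -- consistent with a tied embedding materialized on both end stages:
    assert total - unique == tp * (first - middle)     # 683,008,000 parameters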
- > using checkpoint value 0.0001 for learning rate
- > using checkpoint value 6e-06 for minimum learning rate
- > using checkpoint value 3750000 for warmup iterations
- > using checkpoint value 600000000 for total number of iterations
- > using checkpoint value cosine for decay style
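The five restored values describe a cosine decay from 1e-4 down to 6e-6 after a linear warmup (the mom=[(0.9, 0.95)] entry earlier in the log is FusedAdam's betas). A sketch of the standard rule these numbers plug into; Megatron's scheduler has this shape, though its exact edge handling may differ:

    # Sketch: cosine decay with linear warmup, using the restored checkpoint values.
    import math

    MAX_LR, MIN_LR = 1.0e-4, 6.0e-6
    WARMUP, TOTAL = 3_750_000, 600_000_000

    def lr_at(step: int) -> float:
        if step < WARMUP:                              # linear warmup from 0 to MAX_LR
            return MAX_LR * step / WARMUP
        frac = min((step - WARMUP) / (TOTAL - WARMUP), 1.0)   # 0 -> 1 over the decay window
        return MIN_LR + (MAX_LR - MIN_LR) * 0.5 * (1 + math.cos(math.pi * frac))

    assert abs(lr_at(WARMUP) - MAX_LR) < 1e-12 and abs(lr_at(TOTAL) - MIN_LR) < 1e-12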
-successfully loaded 1 ZeRO state_dicts for rank 96
-successfully loaded 1 ZeRO state_dicts for rank 90
-successfully loaded 1 ZeRO state_dicts for rank 41
-successfully loaded 1 ZeRO state_dicts for rank 36
-successfully loaded 1 ZeRO state_dicts for rank 100
-successfully loaded 1 ZeRO state_dicts for rank 38
-successfully loaded 1 ZeRO state_dicts for rank 88
-successfully loaded 1 ZeRO state_dicts for rank 37
-successfully loaded 1 ZeRO state_dicts for rank 98
-successfully loaded 1 ZeRO state_dicts for rank 4
-successfully loaded 1 ZeRO state_dicts for rank 56
-successfully loaded 1 ZeRO state_dicts for rank 109
-successfully loaded 1 ZeRO state_dicts for rank 44
-successfully loaded 1 ZeRO state_dicts for rank 42
-successfully loaded 1 ZeRO state_dicts for rank 43
-successfully loaded 1 ZeRO state_dicts for rank 99
-successfully loaded 1 ZeRO state_dicts for rank 59
-successfully loaded 1 ZeRO state_dicts for rank 94
-successfully loaded 1 ZeRO state_dicts for rank 7
-successfully loaded 1 ZeRO state_dicts for rank 34
-successfully loaded 1 ZeRO state_dicts for rank 32
-loading 1 zero partition checkpoints for rank 41
-successfully loaded 1 ZeRO state_dicts for rank 45
-successfully loaded 1 ZeRO state_dicts for rank 103
-successfully loaded 1 ZeRO state_dicts for rank 65
-successfully loaded 1 ZeRO state_dicts for rank 60
-loading 1 zero partition checkpoints for rank 38
-loading 1 zero partition checkpoints for rank 36
-successfully loaded 1 ZeRO state_dicts for rank 46
-loading 1 zero partition checkpoints for rank 96
-successfully loaded 1 ZeRO state_dicts for rank 33
-loading 1 zero partition checkpoints for rank 90
-successfully loaded 1 ZeRO state_dicts for rank 101
-successfully loaded 1 ZeRO state_dicts for rank 40
-loading 1 zero partition checkpoints for rank 100
-successfully loaded 1 ZeRO state_dicts for rank 110
-loading 1 zero partition checkpoints for rank 4
-loading 1 zero partition checkpoints for rank 98
-loading 1 zero partition checkpoints for rank 94
-loading 1 zero partition checkpoints for rank 56
-successfully loaded 1 ZeRO state_dicts for rank 39
-loading 1 zero partition checkpoints for rank 43
-successfully loaded 1 ZeRO state_dicts for rank 27
-successfully loaded 1 ZeRO state_dicts for rank 24
-loading 1 zero partition checkpoints for rank 7
-successfully loaded 1 ZeRO state_dicts for rank 86
-successfully loaded 1 ZeRO state_dicts for rank 91
-successfully loaded 1 ZeRO state_dicts for rank 120
-successfully loaded 1 ZeRO state_dicts for rank 108
-successfully loaded 1 ZeRO state_dicts for rank 113
-successfully loaded 1 ZeRO state_dicts for rank 84
-successfully loaded 1 ZeRO state_dicts for rank 97
-loading 1 zero partition checkpoints for rank 34
-successfully loaded 1 ZeRO state_dicts for rank 80
-successfully loaded 1 ZeRO state_dicts for rank 54
-loading 1 zero partition checkpoints for rank 88
-successfully loaded 1 ZeRO state_dicts for rank 105
-successfully loaded 1 ZeRO state_dicts for rank 104
-successfully loaded 1 ZeRO state_dicts for rank 78
-successfully loaded 1 ZeRO state_dicts for rank 26
-successfully loaded 1 ZeRO state_dicts for rank 57
-successfully loaded 1 ZeRO state_dicts for rank 58
-loading 1 zero partition checkpoints for rank 37
-successfully loaded 1 ZeRO state_dicts for rank 121
-successfully loaded 1 ZeRO state_dicts for rank 64
-successfully loaded 1 ZeRO state_dicts for rank 82
-successfully loaded 1 ZeRO state_dicts for rank 52
-successfully loaded 1 ZeRO state_dicts for rank 102
-successfully loaded 1 ZeRO state_dicts for rank 30
-successfully loaded 1 ZeRO state_dicts for rank 20
-successfully loaded 1 ZeRO state_dicts for rank 89
-successfully loaded 1 ZeRO state_dicts for rank 72
-loading 1 zero partition checkpoints for rank 45
-successfully loaded 1 ZeRO state_dicts for rank 48
-successfully loaded 1 ZeRO state_dicts for rank 29
-loading 1 zero partition checkpoints for rank 109
-loading 1 zero partition checkpoints for rank 42
-loading 1 zero partition checkpoints for rank 59
-loading 1 zero partition checkpoints for rank 44
-loading 1 zero partition checkpoints for rank 99
-successfully loaded 1 ZeRO state_dicts for rank 68
-successfully loaded 1 ZeRO state_dicts for rank 71
-successfully loaded 1 ZeRO state_dicts for rank 6
-loading 1 zero partition checkpoints for rank 32
-successfully loaded 1 ZeRO state_dicts for rank 53
-successfully loaded 1 ZeRO state_dicts for rank 93
-loading 1 zero partition checkpoints for rank 103
-loading 1 zero partition checkpoints for rank 65
-loading 1 zero partition checkpoints for rank 60
-successfully loaded 1 ZeRO state_dicts for rank 47
-loading 1 zero partition checkpoints for rank 27
-successfully loaded 1 ZeRO state_dicts for rank 112
-loading 1 zero partition checkpoints for rank 46
-successfully loaded 1 ZeRO state_dicts for rank 79
-successfully loaded 1 ZeRO state_dicts for rank 51
-successfully loaded 1 ZeRO state_dicts for rank 49
-successfully loaded 1 ZeRO state_dicts for rank 35
-successfully loaded 1 ZeRO state_dicts for rank 55
-successfully loaded 1 ZeRO state_dicts for rank 87
-loading 1 zero partition checkpoints for rank 84
-loading 1 zero partition checkpoints for rank 108
-loading 1 zero partition checkpoints for rank 101
-loading 1 zero partition checkpoints for rank 33
-successfully loaded 1 ZeRO state_dicts for rank 25
-loading 1 zero partition checkpoints for rank 40
-successfully loaded 1 ZeRO state_dicts for rank 28
-loading 1 zero partition checkpoints for rank 97
-loading 1 zero partition checkpoints for rank 113
-successfully loaded 1 ZeRO state_dicts for rank 114
-loading 1 zero partition checkpoints for rank 120
-successfully loaded 1 ZeRO state_dicts for rank 77
-loading 1 zero partition checkpoints for rank 58
-successfully loaded 1 ZeRO state_dicts for rank 111
-successfully loaded 1 ZeRO state_dicts for rank 76
-successfully loaded 1 ZeRO state_dicts for rank 5
-successfully loaded 1 ZeRO state_dicts for rank 69
-loading 1 zero partition checkpoints for rank 64
-successfully loaded 1 ZeRO state_dicts for rank 122
-successfully loaded 1 ZeRO state_dicts for rank 115
-loading 1 zero partition checkpoints for rank 89
-successfully loaded 1 ZeRO state_dicts for rank 21
-loading 1 zero partition checkpoints for rank 102
-loading 1 zero partition checkpoints for rank 110
-loading 1 zero partition checkpoints for rank 26
-successfully loaded 1 ZeRO state_dicts for rank 67
-successfully loaded 1 ZeRO state_dicts for rank 22
-loading 1 zero partition checkpoints for rank 39
-successfully loaded 1 ZeRO state_dicts for rank 50
-successfully loaded 1 ZeRO state_dicts for rank 116
-successfully loaded 1 ZeRO state_dicts for rank 66
-loading 1 zero partition checkpoints for rank 121
-successfully loaded 1 ZeRO state_dicts for rank 0
-successfully loaded 1 ZeRO state_dicts for rank 73
-loading 1 zero partition checkpoints for rank 52
-loading 1 zero partition checkpoints for rank 29
-loading 1 zero partition checkpoints for rank 71
-successfully loaded 1 ZeRO state_dicts for rank 107
-loading 1 zero partition checkpoints for rank 24
-loading 1 zero partition checkpoints for rank 104
-loading 1 zero partition checkpoints for rank 86
-loading 1 zero partition checkpoints for rank 82
-successfully loaded 1 ZeRO state_dicts for rank 23
-successfully loaded 1 ZeRO state_dicts for rank 70
-successfully loaded 1 ZeRO state_dicts for rank 117
-loading 1 zero partition checkpoints for rank 91
-successfully loaded 1 ZeRO state_dicts for rank 62
-successfully loaded 1 ZeRO state_dicts for rank 61
-loading 1 zero partition checkpoints for rank 80
-successfully loaded 1 ZeRO state_dicts for rank 10
-loading 1 zero partition checkpoints for rank 53
-loading 1 zero partition checkpoints for rank 105
-loading 1 zero partition checkpoints for rank 54
-successfully loaded 1 ZeRO state_dicts for rank 106
-successfully loaded 1 ZeRO state_dicts for rank 119
-loading 1 zero partition checkpoints for rank 47
-loading 1 zero partition checkpoints for rank 78
-loading 1 zero partition checkpoints for rank 57
-successfully loaded 1 ZeRO state_dicts for rank 81
-successfully loaded 1 ZeRO state_dicts for rank 63
-successfully loaded 1 ZeRO state_dicts for rank 83
-loading 1 zero partition checkpoints for rank 35
-successfully loaded 1 ZeRO state_dicts for rank 3
-loading 1 zero partition checkpoints for rank 20
-loading 1 zero partition checkpoints for rank 30
-loading 1 zero partition checkpoints for rank 72
-loading 1 zero partition checkpoints for rank 79
-loading 1 zero partition checkpoints for rank 87
-successfully loaded 1 ZeRO state_dicts for rank 14
-successfully loaded 1 ZeRO state_dicts for rank 31
-successfully loaded 1 ZeRO state_dicts for rank 11
-loading 1 zero partition checkpoints for rank 112
-loading 1 zero partition checkpoints for rank 48
-successfully loaded 1 ZeRO state_dicts for rank 92
-successfully loaded 1 ZeRO state_dicts for rank 75
-loading 1 zero partition checkpoints for rank 68
-successfully loaded 1 ZeRO state_dicts for rank 85
-successfully loaded 1 ZeRO state_dicts for rank 1
-loading 1 zero partition checkpoints for rank 6
-successfully loaded 1 ZeRO state_dicts for rank 123
-loading 1 zero partition checkpoints for rank 49
-successfully loaded 1 ZeRO state_dicts for rank 2
-loading 1 zero partition checkpoints for rank 93
-loading 1 zero partition checkpoints for rank 76
-loading 1 zero partition checkpoints for rank 69
-loading 1 zero partition checkpoints for rank 22
-successfully loaded 1 ZeRO state_dicts for rank 13
-successfully loaded 1 ZeRO state_dicts for rank 12
-loading 1 zero partition checkpoints for rank 73
-loading 1 zero partition checkpoints for rank 50
-successfully loaded 1 ZeRO state_dicts for rank 9
-successfully loaded 1 ZeRO state_dicts for rank 95
-loading 1 zero partition checkpoints for rank 66
-successfully loaded 1 ZeRO state_dicts for rank 74
-loading 1 zero partition checkpoints for rank 116
-loading 1 zero partition checkpoints for rank 55
-successfully loaded 1 ZeRO state_dicts for rank 8
-loading 1 zero partition checkpoints for rank 51
-loading 1 zero partition checkpoints for rank 25
-loading 1 zero partition checkpoints for rank 21
-loading 1 zero partition checkpoints for rank 0
-loading 1 zero partition checkpoints for rank 28
- checkpoint version 3.0
-loading 1 zero partition checkpoints for rank 114
-loading 1 zero partition checkpoints for rank 77
-successfully loaded 1 ZeRO state_dicts for rank 118
-loading 1 zero partition checkpoints for rank 5
-loading 1 zero partition checkpoints for rank 111
-loading 1 zero partition checkpoints for rank 117
-loading 1 zero partition checkpoints for rank 83
-successfully loaded 1 ZeRO state_dicts for rank 15
-loading 1 zero partition checkpoints for rank 122
-successfully loaded 1 ZeRO state_dicts for rank 124
-loading 1 zero partition checkpoints for rank 62
-loading 1 zero partition checkpoints for rank 106
-loading 1 zero partition checkpoints for rank 115
-loading 1 zero partition checkpoints for rank 67
-successfully loaded 1 ZeRO state_dicts for rank 125
-loading 1 zero partition checkpoints for rank 63
-loading 1 zero partition checkpoints for rank 85
-loading 1 zero partition checkpoints for rank 75
-successfully loaded 1 ZeRO state_dicts for rank 127
-loading 1 zero partition checkpoints for rank 31
-successfully loaded 1 ZeRO state_dicts for rank 16
-loading 1 zero partition checkpoints for rank 107
-loading 1 zero partition checkpoints for rank 11
-loading 1 zero partition checkpoints for rank 23
-loading 1 zero partition checkpoints for rank 70
-loading 1 zero partition checkpoints for rank 61
-loading 1 zero partition checkpoints for rank 13
-loading 1 zero partition checkpoints for rank 10
-loading 1 zero partition checkpoints for rank 1
-loading 1 zero partition checkpoints for rank 119
-loading 1 zero partition checkpoints for rank 2
-loading 1 zero partition checkpoints for rank 9
-loading 1 zero partition checkpoints for rank 81
-loading 1 zero partition checkpoints for rank 14
-successfully loaded 1 ZeRO state_dicts for rank 126
-loading 1 zero partition checkpoints for rank 92
-loading 1 zero partition checkpoints for rank 123
-loading 1 zero partition checkpoints for rank 3
-loading 1 zero partition checkpoints for rank 15
-loading 1 zero partition checkpoints for rank 12
-loading 1 zero partition checkpoints for rank 16
-loading 1 zero partition checkpoints for rank 74
-loading 1 zero partition checkpoints for rank 95
-loading 1 zero partition checkpoints for rank 8
-loading 1 zero partition checkpoints for rank 118
-loading 1 zero partition checkpoints for rank 127
-loading 1 zero partition checkpoints for rank 124
-loading 1 zero partition checkpoints for rank 125
-loading 1 zero partition checkpoints for rank 126
-successfully loaded 1 ZeRO state_dicts for rank 19
-loading 1 zero partition checkpoints for rank 19
-successfully loaded 1 ZeRO state_dicts for rank 17
-successfully loaded 1 ZeRO state_dicts for rank 18
-loading 1 zero partition checkpoints for rank 17
-loading 1 zero partition checkpoints for rank 18
- successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 641
-time (ms) | load-checkpoint: 39142.17
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
- warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
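
The UserWarning above is why the per-rank "estimated model parameters" figures that follow disagree (103.37 vs 125.22 billion): with PP > 1 the first and last pipeline stages each hold a copy of the tied embedding matrix, so a naive per-stage sum counts it more than once. A small arithmetic sketch using the TOTAL_PARAMS and UNIQUE_PARAMS figures from the engine.py lines earlier; attributing the whole difference to replicated embeddings is an inference, not something the log states.

# Figures copied from the engine.py:151 records above.
TOTAL_PARAMS = 104_731_203_200   # per-stage sums added up; replicas counted each time
UNIQUE_PARAMS = 104_048_195_200  # every tensor counted once
replicated = TOTAL_PARAMS - UNIQUE_PARAMS  # assumed: the duplicated embedding copy
print(f"parameters counted more than once: {replicated:,}")       # 683,008,000
print(f"unique parameters, billions: {UNIQUE_PARAMS / 1e9:.4f}")  # ~104.05
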
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
-estimated model parameters: 125.22432
-estimated model parameters without embeddings: 103.3650944
-estimated model parameters without embeddings: 103.368064
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first
and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.368064 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without 
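For context, the two figures being printed ("estimated model parameters" with and without embeddings) differ exactly because of the warning: under pipeline parallelism the first and last stages each hold a copy of the tied embeddings, so summing per-stage counts overstates the total. A minimal sketch of the distinction, assuming a generic PyTorch module whose embeddings live under a hypothetical `word_embeddings` attribute (not Megatron-DeepSpeed's actual layout):

    import torch.nn as nn

    def param_count_billions(model: nn.Module, exclude_embeddings: bool = False) -> float:
        """Parameter count in billions, as reported in the log.

        With PP > 1 the first and last pipeline stages each hold a copy
        of the tied embeddings, so summing per-stage counts overstates
        the total; that is what the UserWarning above is about.
        """
        total = sum(p.numel() for p in model.parameters())
        if exclude_embeddings:
            # 'word_embeddings' is a hypothetical attribute name, used
            # purely to illustrate the "without embeddings" variant.
            total -= sum(p.numel() for p in model.word_embeddings.parameters())
        return total / 1e9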
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-25 17:01:37
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
- train: 600000000
- validation: 20008960
- test: 10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
- reading sizes...
- reading pointers...
- reading document index...
- creating numpy buffer of mmap...
- creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.155938 seconds
- number of documents: 304230423
- > dataset split:
- train:
- document indices in [0, 288714672) total of 288714672 documents
- validation:
- document indices in [288714672, 303926193) total of 15211521 documents
- test:
- document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
- loaded indexed file in 0.341 seconds
- total number of samples: 657686117
- total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_shuffle_idx.npy
- loaded indexed file in 0.239 seconds
- total number of samples: 20781483
- total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
- loaded indexed file in 0.081 seconds
- total number of samples: 137384
- total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-25 17:01:45
-done with setup ...
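The index-map filenames encode the request: `600000000ns_2048sl_43s` is apparently 600M samples at sequence length 2048 with seed 43. The epoch counts follow from how many shuffled passes over each split are needed to serve the requested number of samples. A quick consistency check, with all constants copied from the log above (the ceil-based formulation is a simplification of the actual index-building logic):

    requested = {"train": 600_000_000, "validation": 20_008_960, "test": 10_240}
    totals    = {"train": 657_686_117, "validation": 20_781_483, "test": 137_384}
    epochs    = {"train": 5, "validation": 3, "test": 1}

    for split, n in requested.items():
        per_epoch = int(totals[split] / epochs[split])
        # ceil(requested / samples-per-epoch) reproduces the logged epoch count
        assert -(-n // per_epoch) == epochs[split]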
-time (ms) | model-and-optimizer-setup: 45246.43 | train/valid/test-data-iterators-setup: 6750.27
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.3650944 billion
-training ...
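As a sanity check on these totals: a transformer with L layers of hidden size H has roughly 12*L*H^2 parameters in its blocks alone (4*H^2 for attention plus 8*H^2 for the two 4x MLP projections). With the 64 layers reported in the activation-checkpointing banner further down, and assuming a hidden size of 11600 (the hidden size is not stated in this excerpt), the estimate lands within a fraction of a percent of the logged figure:

    # Back-of-the-envelope transformer size; L is from the log, H is assumed.
    L, H = 64, 11600
    block_params = 12 * L * H ** 2   # attention (4H^2) + MLP (8H^2) per layer
    print(block_params / 1e9)        # -> 103.34208, vs. 103.3650944 reported
    # Biases, layernorms and position embeddings account for the small remainder.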
-Number of parameters without embeddings: 103.368064 billion
-[before the start of training step] datetime: 2021-10-25 17:01:45
-[2021-10-25 17:01:45,673] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-25 17:01:45,673] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-25 17:01:45,673] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-25 17:01:45,673] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-25 17:01:45,673] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
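These INFO lines are DeepSpeed's activation-checkpointing banner, and each flag maps onto a knob of its configuration call. A minimal sketch of how such a configuration is typically established (illustrative only, not this run's actual setup code; `mpu` is Megatron's model-parallel utilities module):

    import deepspeed
    from megatron import mpu  # Megatron-DeepSpeed's model-parallel utilities

    # Mirrors the flags printed above.
    deepspeed.checkpointing.configure(
        mpu,
        partition_activations=False,     # "Partition Activations False"
        contiguous_checkpointing=False,  # "contiguous Memory Checkpointing False"
        checkpoint_in_cpu=False,         # "CPU CHECKPOINTING False"
        synchronize=False,               # "Synchronization False"
        profile=False,                   # "Profiling time in checkpointing False"
    )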
-[Rank 1] (after 642 iterations) memory (MB) | allocated: 13205.4814453125 | max allocated: 20669.0302734375 | reserved: 24428.0 | max reserved: 24428.0
-[Rank 5] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20096.0 | max reserved: 20096.0
-[Rank 125] (after 642 iterations) memory (MB) | allocated: 13088.81005859375 | max allocated: 20552.416015625 | reserved: 24408.0 | max reserved: 24408.0
-[Rank 9] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 13] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 17] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 25] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 33] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 29] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 21] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 4] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20096.0 | max reserved: 20096.0
-[Rank 8] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 0] (after 642 iterations) memory (MB) | allocated: 13203.4814453125 | max allocated: 20667.0302734375 | reserved: 24428.0 | max reserved: 24428.0
-[Rank 16] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 12] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 124] (after 642 iterations) memory (MB) | allocated: 13088.41748046875 | max allocated: 20552.0234375 | reserved: 24408.0 | max reserved: 24408.0
-[Rank 28] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 32] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 24] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 20] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 45] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 41] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 49] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 37] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 53] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 57] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 61] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 65] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 2] (after 642 iterations) memory (MB) | allocated: 13203.2490234375 | max allocated: 20666.7978515625 | reserved: 24428.0 | max reserved: 24428.0
-[Rank 3] (after 642 iterations) memory (MB) | allocated: 13204.06298828125 | max allocated: 20667.61181640625 | reserved: 24428.0 | max reserved: 24428.0
-[Rank 44] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 40] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 126] (after 642 iterations) memory (MB) | allocated: 13089.05810546875 | max allocated: 20552.6640625 | reserved: 24408.0 | max reserved: 24408.0
-[Rank 14] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 48] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 6] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20096.0 | max reserved: 20096.0
-[Rank 7] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20096.0 | max reserved: 20096.0
-[Rank 15] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 60] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 56] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 11] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 36] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 18] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 10] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 19] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20094.0 | max reserved: 20094.0
-[Rank 22] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 23] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 27] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 64] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 31] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 26] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 30] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20092.0 | max reserved: 20092.0
-[Rank 39] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 38] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 42] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 35] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 34] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 46] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 47] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 43] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20090.0 | max reserved: 20090.0
-[Rank 52] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 51] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 50] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 55] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 58] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 54] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20088.0 | max reserved: 20088.0
-[Rank 59] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 67] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 62] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 63] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 66] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20086.0 | max reserved: 20086.0
-[Rank 71] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 83] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 82] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 81] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 69] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 70] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 80] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 76] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 68] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 77] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 79] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 78] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 90] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 91] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 72] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 73] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 75] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 74] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20084.0 | max reserved: 20084.0
-[Rank 87] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 85] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 86] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 84] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 88] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 94] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 89] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20082.0 | max reserved: 20082.0
-[Rank 99] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 93] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 92] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 95] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 97] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 98] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 96] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 113] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 114] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 103] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 102] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 119] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 115] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 101] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 118] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 117] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 109] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 110] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 108] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 116] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 112] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 107] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 106] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 111] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 105] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 121] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 123] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 122] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 120] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20076.0 | max reserved: 20076.0
-[Rank 100] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20080.0 | max reserved: 20080.0
-[Rank 104] (after 642 iterations) memory (MB) | allocated: 10787.46826171875 | max allocated: 16947.64990234375 | reserved: 20078.0 | max reserved: 20078.0
- iteration 642/ 292968 | consumed samples: 1314816 | consumed tokens: 102465536 | elapsed time per iteration (ms): 219505.1 | learning rate: 3.506E-05 | global batch size: 2048 | lm loss: 5.334343E+00 | loss scale: 8192.0 | grad norm: 10636.967 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-[Rank 127] (after 642 iterations) memory (MB) | allocated: 13088.81005859375 | max allocated: 20552.416015625 | reserved: 24408.0 | max reserved: 24408.0
-time (ms)
- iteration 643/ 292968 | consumed samples: 1316864 | consumed tokens: 102662144 | elapsed time per iteration (ms): 125892.0 | learning rate: 3.512E-05 | global batch size: 2048 | lm loss: 5.412467E+00 | loss scale: 8192.0 | grad norm: 20871.669 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 644/ 292968 | consumed samples: 1318912 | consumed tokens: 102858752 | elapsed time per iteration (ms): 133757.3 | learning rate: 3.517E-05 | global batch size: 2048 | lm loss: 5.394762E+00 | loss scale: 8192.0 | grad norm: 15610.886 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
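Each step consumes the full global batch of 2048 samples, and with curriculum learning the sequence length is currently capped at 96 tokens, so the token counter should advance by 2048 * 96 = 196,608 per iteration. A quick consistency check on the counters in the surrounding iteration lines (all values copied from the log):

    consumed_tokens  = {642: 102_465_536, 643: 102_662_144, 644: 102_858_752}
    consumed_samples = {642: 1_314_816, 643: 1_316_864, 644: 1_318_912}
    GLOBAL_BATCH_SIZE = 2048   # "global batch size: 2048"
    CURRICULUM_SEQLEN = 96     # "curriculum seqlen: 96"

    for it in (643, 644):
        assert consumed_samples[it] - consumed_samples[it - 1] == GLOBAL_BATCH_SIZE
        # tokens advance by batch * current (curriculum-capped) sequence length
        assert (consumed_tokens[it] - consumed_tokens[it - 1]
                == GLOBAL_BATCH_SIZE * CURRICULUM_SEQLEN)  # 196,608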
- iteration 645/ 292968 | consumed samples: 1320960 | consumed tokens: 103055360 | elapsed time per iteration (ms): 130868.8 | learning rate: 3.523E-05 | global batch size: 2048 | lm loss: 5.368480E+00 | loss scale: 8192.0 | grad norm: 14600.618 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 646/ 292968 | consumed samples: 1323008 | consumed tokens: 103251968 | elapsed time per iteration (ms): 132321.1 | learning rate: 3.528E-05 | global batch size: 2048 | lm loss: 5.398826E+00 | loss scale: 8192.0 | grad norm: 24473.005 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 647/ 292968 | consumed samples: 1325056 | consumed tokens: 103448576 | elapsed time per iteration (ms): 122887.4 | learning rate: 3.533E-05 | global batch size: 2048 | lm loss: 5.350785E+00 | loss scale: 8192.0 | grad norm: 11410.247 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 648/ 292968 | consumed samples: 1327104 | consumed tokens: 103645184 | elapsed time per iteration (ms): 134163.1 | learning rate: 3.539E-05 | global batch size: 2048 | lm loss: 5.330161E+00 | loss scale: 8192.0 | grad norm: 12625.897 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 649/ 292968 | consumed samples: 1329152 | consumed tokens: 103841792 | elapsed time per iteration (ms): 130944.1 | learning rate: 3.544E-05 | global batch size: 2048 | lm loss: 5.289292E+00 | loss scale: 8192.0 | grad norm: 8915.660 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 650/ 292968 | consumed samples: 1331200 | consumed tokens: 104038400 | elapsed time per iteration (ms): 130923.1 | learning rate: 3.550E-05 | global batch size: 2048 | lm loss: 5.305474E+00 | loss scale: 8192.0 | grad norm: 9889.439 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 651/ 292968 | consumed samples: 1333248 | consumed tokens: 104235008 | elapsed time per iteration (ms): 143156.3 | learning rate: 3.555E-05 | global batch size: 2048 | lm loss: 5.318254E+00 | loss scale: 8192.0 | grad norm: 9110.004 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 652/ 292968 | consumed samples: 1335296 | consumed tokens: 104431616 | elapsed time per iteration (ms): 146926.8 | learning rate: 3.561E-05 | global batch size: 2048 | lm loss: 5.282621E+00 | loss scale: 8192.0 | grad norm: 8615.451 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 653/ 292968 | consumed samples: 1337344 | consumed tokens: 104628224 | elapsed time per iteration (ms): 143730.1 | learning rate: 3.566E-05 | global batch size: 2048 | lm loss: 5.316740E+00 | loss scale: 8192.0 | grad norm: 9280.621 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 654/ 292968 | consumed samples: 1339392 | consumed tokens: 104824832 | elapsed time per iteration (ms): 154616.2 | learning rate: 3.572E-05 | global batch size: 2048 | lm loss: 5.274152E+00 | loss scale: 8192.0 | grad norm: 8229.109 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 655/ 292968 | consumed samples: 1341440 | consumed tokens: 105021440 | elapsed time per iteration (ms): 143075.0 | learning rate: 3.577E-05 | global batch size: 2048 | lm loss: 5.310796E+00 | loss scale: 8192.0 | grad norm: 10539.644 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 656/ 292968 | consumed samples: 1343488 | consumed tokens: 105218048 | elapsed time per iteration (ms): 148820.9 | learning rate: 3.583E-05 | global batch size: 2048 | lm loss: 5.310678E+00 | loss scale: 8192.0 | grad norm: 9044.385 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 657/ 292968 | consumed samples: 1345536 | consumed tokens: 105414656 | elapsed time per iteration (ms): 136602.8 | learning rate: 3.588E-05 | global batch size: 2048 | lm loss: 5.289979E+00 | loss scale: 8192.0 | grad norm: 10719.767 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 658/ 292968 | consumed samples: 1347584 | consumed tokens: 105611264 | elapsed time per iteration (ms): 143776.9 | learning rate: 3.594E-05 | global batch size: 2048 | lm loss: 5.292214E+00 | loss scale: 8192.0 | grad norm: 9126.406 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 659/ 292968 | consumed samples: 1349632 | consumed tokens: 105807872 | elapsed time per iteration (ms): 137603.3 | learning rate: 3.599E-05 | global batch size: 2048 | lm loss: 5.286619E+00 | loss scale: 8192.0 | grad norm: 10887.119 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 660/ 292968 | consumed samples: 1351680 | consumed tokens: 106004480 | elapsed time per iteration (ms): 130752.7 | learning rate: 3.604E-05 | global batch size: 2048 | lm loss: 5.256087E+00 | loss scale: 8192.0 | grad norm: 9150.245 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 661/ 292968 | consumed samples: 1353728 | consumed tokens: 106201088 | elapsed time per iteration (ms): 120641.9 | learning rate: 3.610E-05 | global batch size: 2048 | lm loss: 5.249431E+00 | loss scale: 8192.0 | grad norm: 7508.986 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 662/ 292968 | consumed samples: 1355776 | consumed tokens: 106397696 | elapsed time per iteration (ms): 131900.7 | learning rate: 3.615E-05 | global batch size: 2048 | lm loss: 5.240894E+00 | loss scale: 8192.0 | grad norm: 8622.773 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 663/ 292968 | consumed samples: 1357824 | consumed tokens: 106594304 | elapsed time per iteration (ms): 125828.3 | learning rate: 3.621E-05 | global batch size: 2048 | lm loss: 5.258747E+00 | loss scale: 8192.0 | grad norm: 9476.512 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 664/ 292968 | consumed samples: 1359872 | consumed tokens: 106790912 | elapsed time per iteration (ms): 126588.1 | learning rate: 3.626E-05 | global batch size: 2048 | lm loss: 5.267451E+00 | loss scale: 8192.0 | grad norm: 8741.716 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 665/ 292968 | consumed samples: 1361920 | consumed tokens: 106987520 | elapsed time per iteration (ms): 119796.7 | learning rate: 3.632E-05 | global batch size: 2048 | lm loss: 5.252110E+00 | loss scale: 8192.0 | grad norm: 9103.028 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 666/ 292968 | consumed samples: 1363968 | consumed tokens: 107184128 | elapsed time per iteration (ms): 117112.6 | learning rate: 3.637E-05 | global batch size: 2048 | lm loss: 5.229414E+00 | loss scale: 8192.0 | grad norm: 7841.873 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 667/ 292968 | consumed samples: 1366016 | consumed tokens: 107380736 | elapsed time per iteration (ms): 106663.0 | learning rate: 3.643E-05 | global batch size: 2048 | lm loss: 5.272611E+00 | loss scale: 8192.0 | grad norm: 9170.979 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 668/ 292968 | consumed samples: 1368064 | consumed tokens: 107577344 | elapsed time per iteration (ms): 103394.3 | learning rate: 3.648E-05 | global batch size: 2048 | lm loss: 5.227648E+00 | loss scale: 8192.0 | grad norm: 11054.814 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 669/ 292968 | consumed samples: 1370112 | consumed tokens: 107773952 | elapsed time per iteration (ms): 104189.3 | learning rate: 3.654E-05 | global batch size: 2048 | lm loss: 5.247322E+00 | loss scale: 8192.0 | grad norm: 8504.236 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 670/ 292968 | consumed samples: 1372160 | consumed tokens: 107970560 | elapsed time per iteration (ms): 104303.8 | learning rate: 3.659E-05 | global batch size: 2048 | lm loss: 5.244978E+00 | loss scale: 8192.0 | grad norm: 12015.048 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 671/ 292968 | consumed samples: 1374208 | consumed tokens: 108167168 | elapsed time per iteration (ms): 107228.3 | learning rate: 3.665E-05 | global batch size: 2048 | lm loss: 5.243213E+00 | loss scale: 8192.0 | grad norm: 8404.357 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 672/ 292968 | consumed samples: 1376256 | consumed tokens: 108363776 | elapsed time per iteration (ms): 107397.2 | learning rate: 3.670E-05 | global batch size: 2048 | lm loss: 5.233768E+00 | loss scale: 8192.0 | grad norm: 10867.712 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 673/ 292968 | consumed samples: 1378304 | consumed tokens: 108560384 | elapsed time per iteration (ms): 111663.4 | learning rate: 3.675E-05 | global batch size: 2048 | lm loss: 5.218716E+00 | loss scale: 8192.0 | grad norm: 9968.809 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 674/ 292968 | consumed samples: 1380352 | consumed tokens: 108756992 | elapsed time per iteration (ms): 101274.8 | learning rate: 3.681E-05 |
global batch size: 2048 | lm loss: 5.234522E+00 | loss scale: 8192.0 | grad norm: 8131.011 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 675/ 292968 | consumed samples: 1382400 | consumed tokens: 108953600 | elapsed time per iteration (ms): 102827.3 | learning rate: 3.686E-05 | global batch size: 2048 | lm loss: 5.226708E+00 | loss scale: 8192.0 | grad norm: 10593.331 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 676/ 292968 | consumed samples: 1384448 | consumed tokens: 109150208 | elapsed time per iteration (ms): 109892.5 | learning rate: 3.692E-05 | global batch size: 2048 | lm loss: 5.231604E+00 | loss scale: 8192.0 | grad norm: 9093.235 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 677/ 292968 | consumed samples: 1386496 | consumed tokens: 109346816 | elapsed time per iteration (ms): 117143.9 | learning rate: 3.697E-05 | global batch size: 2048 | lm loss: 5.218035E+00 | loss scale: 8192.0 | grad norm: 10583.202 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 678/ 292968 | consumed samples: 1388544 | consumed tokens: 109543424 | elapsed time per iteration (ms): 143029.1 | learning rate: 3.703E-05 | global batch size: 2048 | lm loss: 5.212083E+00 | loss scale: 8192.0 | grad norm: 9427.938 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 679/ 292968 | consumed samples: 1390592 | consumed tokens: 109740032 | elapsed time per iteration (ms): 127496.8 | learning rate: 3.708E-05 | global batch size: 2048 | lm loss: 5.222923E+00 | loss scale: 8192.0 | grad norm: 10467.949 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 680/ 292968 | consumed samples: 1392640 | consumed tokens: 109936640 | elapsed time per iteration (ms): 125946.7 | learning rate: 3.714E-05 | global batch size: 2048 | lm loss: 5.200369E+00 | loss scale: 8192.0 | grad norm: 9287.753 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 681/ 292968 | consumed samples: 1394688 | consumed tokens: 110133248 | elapsed time per iteration (ms): 120027.8 | learning rate: 3.719E-05 | global batch size: 2048 | lm loss: 5.186337E+00 | loss scale: 8192.0 | grad norm: 8230.043 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 682/ 292968 | consumed samples: 1396736 | consumed tokens: 110329856 | elapsed time per iteration (ms): 127461.8 | learning rate: 3.725E-05 | global batch size: 2048 | lm loss: 5.208741E+00 | loss scale: 8192.0 | grad norm: 8618.723 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 683/ 292968 | consumed samples: 1398784 | consumed tokens: 110526464 | elapsed time per iteration (ms): 116420.5 | learning rate: 3.730E-05 | global batch size: 2048 | lm loss: 5.182314E+00 | loss scale: 8192.0 | grad norm: 8953.065 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 684/ 292968 | consumed samples: 1400832 | consumed tokens: 
110723072 | elapsed time per iteration (ms): 109314.1 | learning rate: 3.736E-05 | global batch size: 2048 | lm loss: 5.253952E+00 | loss scale: 8192.0 | grad norm: 10873.328 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 685/ 292968 | consumed samples: 1402880 | consumed tokens: 110919680 | elapsed time per iteration (ms): 119842.1 | learning rate: 3.741E-05 | global batch size: 2048 | lm loss: 5.213473E+00 | loss scale: 8192.0 | grad norm: 9054.660 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 686/ 292968 | consumed samples: 1404928 | consumed tokens: 111116288 | elapsed time per iteration (ms): 112609.1 | learning rate: 3.746E-05 | global batch size: 2048 | lm loss: 5.200142E+00 | loss scale: 8192.0 | grad norm: 9041.503 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 687/ 292968 | consumed samples: 1406976 | consumed tokens: 111312896 | elapsed time per iteration (ms): 117520.3 | learning rate: 3.752E-05 | global batch size: 2048 | lm loss: 5.176431E+00 | loss scale: 8192.0 | grad norm: 11055.788 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 688/ 292968 | consumed samples: 1409024 | consumed tokens: 111509504 | elapsed time per iteration (ms): 118007.9 | learning rate: 3.757E-05 | global batch size: 2048 | lm loss: 5.179708E+00 | loss scale: 8192.0 | grad norm: 7957.756 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 689/ 292968 | consumed samples: 1411072 | consumed tokens: 111706112 | elapsed time per iteration (ms): 119866.8 | learning rate: 3.763E-05 | global batch size: 2048 | lm loss: 5.189474E+00 | loss scale: 8192.0 | grad norm: 9694.000 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 690/ 292968 | consumed samples: 1413120 | consumed tokens: 111902720 | elapsed time per iteration (ms): 110605.3 | learning rate: 3.768E-05 | global batch size: 2048 | lm loss: 5.201509E+00 | loss scale: 8192.0 | grad norm: 9995.050 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 691/ 292968 | consumed samples: 1415168 | consumed tokens: 112099328 | elapsed time per iteration (ms): 103655.2 | learning rate: 3.774E-05 | global batch size: 2048 | lm loss: 5.223563E+00 | loss scale: 8192.0 | grad norm: 9601.400 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 692/ 292968 | consumed samples: 1417216 | consumed tokens: 112295936 | elapsed time per iteration (ms): 108755.8 | learning rate: 3.779E-05 | global batch size: 2048 | lm loss: 5.166238E+00 | loss scale: 8192.0 | grad norm: 10625.566 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 693/ 292968 | consumed samples: 1419264 | consumed tokens: 112492544 | elapsed time per iteration (ms): 102372.7 | learning rate: 3.785E-05 | global batch size: 2048 | lm loss: 5.190458E+00 | loss scale: 8192.0 | grad norm: 11533.432 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 | 
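The sample and token counters in the records above follow directly from the logged batch and curriculum settings; a minimal Python sketch of that bookkeeping (the rules are inferred from the logged numbers only, not taken from the training code):

    # Bookkeeping implied by the records above (assumed, not from the training code):
    # each iteration consumes one global batch of 2048 samples, each truncated to the
    # current curriculum seqlen (96 tokens at this point in training).
    global_batch_size = 2048
    curriculum_seqlen = 96
    tokens_per_iteration = global_batch_size * curriculum_seqlen  # 196608

    # Cross-checks against the logged values for iterations 646 and 647:
    assert 1325056 - 1323008 == global_batch_size
    assert 103448576 - 103251968 == tokens_per_iteration
    assert 647 * global_batch_size == 1325056  # counters started at iteration 0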
- iteration 694/ 292968 | consumed samples: 1421312 | consumed tokens: 112689152 | elapsed time per iteration (ms): 110113.4 | learning rate: 3.790E-05 | global batch size: 2048 | lm loss: 5.202763E+00 | loss scale: 8192.0 | grad norm: 9628.399 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 695/ 292968 | consumed samples: 1423360 | consumed tokens: 112885760 | elapsed time per iteration (ms): 102040.5 | learning rate: 3.796E-05 | global batch size: 2048 | lm loss: 5.170166E+00 | loss scale: 8192.0 | grad norm: 10944.866 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 696/ 292968 | consumed samples: 1425408 | consumed tokens: 113082368 | elapsed time per iteration (ms): 97553.7 | learning rate: 3.801E-05 | global batch size: 2048 | lm loss: 5.176034E+00 | loss scale: 8192.0 | grad norm: 12551.502 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 697/ 292968 | consumed samples: 1427456 | consumed tokens: 113278976 | elapsed time per iteration (ms): 100107.5 | learning rate: 3.807E-05 | global batch size: 2048 | lm loss: 5.146069E+00 | loss scale: 8192.0 | grad norm: 6782.441 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 698/ 292968 | consumed samples: 1429504 | consumed tokens: 113475584 | elapsed time per iteration (ms): 109688.9 | learning rate: 3.812E-05 | global batch size: 2048 | lm loss: 5.172399E+00 | loss scale: 8192.0 | grad norm: 12811.933 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 699/ 292968 | consumed samples: 1431552 | consumed tokens: 113672192 | elapsed time per iteration (ms): 109547.5 | learning rate: 3.817E-05 | global batch size: 2048 | lm loss: 5.165838E+00 | loss scale: 8192.0 | grad norm: 13686.515 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 700/ 292968 | consumed samples: 1433600 | consumed tokens: 113868800 | elapsed time per iteration (ms): 113219.4 | learning rate: 3.823E-05 | global batch size: 2048 | lm loss: 5.186374E+00 | loss scale: 8192.0 | grad norm: 8076.695 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 701/ 292968 | consumed samples: 1435648 | consumed tokens: 114065408 | elapsed time per iteration (ms): 126789.6 | learning rate: 3.828E-05 | global batch size: 2048 | lm loss: 5.157846E+00 | loss scale: 8192.0 | grad norm: 13178.728 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 702/ 292968 | consumed samples: 1437696 | consumed tokens: 114262016 | elapsed time per iteration (ms): 115190.8 | learning rate: 3.834E-05 | global batch size: 2048 | lm loss: 5.191998E+00 | loss scale: 8192.0 | grad norm: 9035.968 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 703/ 292968 | consumed samples: 1439744 | consumed tokens: 114458624 | elapsed time per iteration (ms): 112187.6 | learning rate: 3.839E-05 | global batch size: 2048 | lm loss: 5.208030E+00 | loss scale: 8192.0 | grad norm: 12973.484 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 704/ 292968 | consumed samples: 1441792 | consumed tokens: 114655232 | elapsed time per iteration (ms): 116327.0 | learning rate: 3.845E-05 | global batch size: 2048 | lm loss: 5.162397E+00 | loss scale: 8192.0 | grad norm: 10271.785 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 705/ 292968 | consumed samples: 1443840 | consumed tokens: 114851840 | elapsed time per iteration (ms): 111800.7 | learning rate: 3.850E-05 | global batch size: 2048 | lm loss: 5.168898E+00 | loss scale: 8192.0 | grad norm: 8225.549 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 706/ 292968 | consumed samples: 1445888 | consumed tokens: 115048448 | elapsed time per iteration (ms): 107866.4 | learning rate: 3.856E-05 | global batch size: 2048 | lm loss: 5.172147E+00 | loss scale: 8192.0 | grad norm: 13116.569 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 707/ 292968 | consumed samples: 1447936 | consumed tokens: 115245056 | elapsed time per iteration (ms): 110903.9 | learning rate: 3.861E-05 | global batch size: 2048 | lm loss: 5.175503E+00 | loss scale: 8192.0 | grad norm: 7329.200 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 708/ 292968 | consumed samples: 1449984 | consumed tokens: 115441664 | elapsed time per iteration (ms): 106484.3 | learning rate: 3.867E-05 | global batch size: 2048 | lm loss: 5.162799E+00 | loss scale: 8192.0 | grad norm: 12798.169 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 709/ 292968 | consumed samples: 1452032 | consumed tokens: 115638272 | elapsed time per iteration (ms): 106101.3 | learning rate: 3.872E-05 | global batch size: 2048 | lm loss: 5.125592E+00 | loss scale: 8192.0 | grad norm: 8775.719 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 710/ 292968 | consumed samples: 1454080 | consumed tokens: 115834880 | elapsed time per iteration (ms): 98922.9 | learning rate: 3.878E-05 | global batch size: 2048 | lm loss: 5.154107E+00 | loss scale: 8192.0 | grad norm: 8370.929 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 711/ 292968 | consumed samples: 1456128 | consumed tokens: 116031488 | elapsed time per iteration (ms): 100539.2 | learning rate: 3.883E-05 | global batch size: 2048 | lm loss: 5.188827E+00 | loss scale: 8192.0 | grad norm: 10170.930 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 712/ 292968 | consumed samples: 1458176 | consumed tokens: 116228096 | elapsed time per iteration (ms): 99293.3 | learning rate: 3.888E-05 | global batch size: 2048 | lm loss: 5.153638E+00 | loss scale: 8192.0 | grad norm: 9751.554 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 713/ 292968 | consumed samples: 1460224 | consumed tokens: 116424704 | elapsed time per iteration (ms): 97446.4 | learning rate: 3.894E-05 | global batch size: 2048 | lm loss: 5.185704E+00 | loss scale: 8192.0 | grad norm: 9467.768 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 714/ 292968 | consumed samples: 1462272 | consumed tokens: 116621312 | elapsed time per iteration (ms): 93499.1 | learning rate: 3.899E-05 | global batch size: 2048 | lm loss: 5.177588E+00 | loss scale: 8192.0 | grad norm: 11335.901 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 715/ 292968 | consumed samples: 1464320 | consumed tokens: 116817920 | elapsed time per iteration (ms): 94643.4 | learning rate: 3.905E-05 | global batch size: 2048 | lm loss: 5.185459E+00 | loss scale: 8192.0 | grad norm: 8536.241 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 716/ 292968 | consumed samples: 1466368 | consumed tokens: 117014528 | elapsed time per iteration (ms): 99892.8 | learning rate: 3.910E-05 | global batch size: 2048 | lm loss: 5.135908E+00 | loss scale: 8192.0 | grad norm: 6463.794 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 717/ 292968 | consumed samples: 1468416 | consumed tokens: 117211136 | elapsed time per iteration (ms): 104277.9 | learning rate: 3.916E-05 | global batch size: 2048 | lm loss: 5.151158E+00 | loss scale: 8192.0 | grad norm: 8612.554 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 718/ 292968 | consumed samples: 1470464 | consumed tokens: 117407744 | elapsed time per iteration (ms): 104436.5 | learning rate: 3.921E-05 | global batch size: 2048 | lm loss: 5.167432E+00 | loss scale: 8192.0 | grad norm: 9826.560 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 719/ 292968 | consumed samples: 1472512 | consumed tokens: 117604352 | elapsed time per iteration (ms): 100390.6 | learning rate: 3.927E-05 | global batch size: 2048 | lm loss: 5.134981E+00 | loss scale: 8192.0 | grad norm: 7153.312 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 720/ 292968 | consumed samples: 1474560 | consumed tokens: 117800960 | elapsed time per iteration (ms): 95315.0 | learning rate: 3.932E-05 | global batch size: 2048 | lm loss: 5.142948E+00 | loss scale: 8192.0 | grad norm: 8131.710 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 721/ 292968 | consumed samples: 1476608 | consumed tokens: 117997568 | elapsed time per iteration (ms): 95603.7 | learning rate: 3.938E-05 | global batch size: 2048 | lm loss: 5.147059E+00 | loss scale: 8192.0 | grad norm: 10278.883 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 722/ 292968 | consumed samples: 1478656 | consumed tokens: 118194176 | elapsed time per iteration (ms): 99206.1 | learning rate: 3.943E-05 | global batch size: 2048 | lm loss: 5.156811E+00 | loss scale: 8192.0 | grad norm: 10296.426 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 723/ 292968 | consumed samples: 1480704 | consumed tokens: 118390784 | elapsed time per iteration (ms): 96657.7 | learning rate: 3.949E-05 | global batch size: 2048 | lm loss: 5.142353E+00 | loss scale: 8192.0 | grad norm: 11038.499 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 724/ 292968 | consumed samples: 1482752 | consumed tokens: 118587392 | elapsed time per iteration (ms): 97978.9 | learning rate: 3.954E-05 | global batch size: 2048 | lm loss: 5.136504E+00 | loss scale: 8192.0 | grad norm: 8216.465 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 725/ 292968 | consumed samples: 1484800 | consumed tokens: 118784000 | elapsed time per iteration (ms): 98499.0 | learning rate: 3.959E-05 | global batch size: 2048 | lm loss: 5.102932E+00 | loss scale: 8192.0 | grad norm: 12253.114 | num zeros: 0.0 | curriculum seqlen: 96 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 726/ 292968 | consumed samples: 1486848 | consumed tokens: 118996992 | elapsed time per iteration (ms): 111151.2 | learning rate: 3.965E-05 | global batch size: 2048 | lm loss: 5.187205E+00 | loss scale: 8192.0 | grad norm: 10403.797 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 727/ 292968 | consumed samples: 1488896 | consumed tokens: 119209984 | elapsed time per iteration (ms): 103481.5 | learning rate: 3.970E-05 | global batch size: 2048 | lm loss: 5.237492E+00 | loss scale: 8192.0 | grad norm: 14848.460 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 728/ 292968 | consumed samples: 1490944 | consumed tokens: 119422976 | elapsed time per iteration (ms): 101792.4 | learning rate: 3.976E-05 | global batch size: 2048 | lm loss: 5.221199E+00 | loss scale: 8192.0 | grad norm: 11316.264 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 729/ 292968 | consumed samples: 1492992 | consumed tokens: 119635968 | elapsed time per iteration (ms): 96228.0 | learning rate: 3.981E-05 | global batch size: 2048 | lm loss: 5.204230E+00 | loss scale: 8192.0 | grad norm: 9812.354 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 730/ 292968 | consumed samples: 1495040 | consumed tokens: 119848960 | elapsed time per iteration (ms): 87336.0 | learning rate: 3.987E-05 | global batch size: 2048 | lm loss: 5.197078E+00 | loss scale: 8192.0 | grad norm: 10784.331 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 731/ 292968 | consumed samples: 1497088 | consumed tokens: 120061952 | elapsed time per iteration (ms): 84915.5 | learning rate: 3.992E-05 | global batch size: 2048 | lm loss: 5.237545E+00 | loss scale: 8192.0 | grad norm: 11289.078 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 732/ 292968 | consumed samples: 1499136 | consumed tokens: 120274944 | elapsed time per iteration (ms): 93780.6 | learning rate: 3.998E-05 | global batch size: 2048 | lm loss: 5.172385E+00 | loss scale: 8192.0 | grad norm: 11504.190 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
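At iteration 726 the curriculum seqlen steps from 96 to 104 (with a brief lm-loss uptick, 5.102932 to 5.187205, right after the step), and the per-iteration token increment steps with it; a short check using only deltas copied from the records above:

    # Curriculum step visible above: seqlen 96 -> 104 at iteration 726, so the
    # per-iteration token increment grows from 2048*96 to 2048*104.
    assert 118784000 - 118587392 == 2048 * 96   # iteration 725, curriculum seqlen 96
    assert 118996992 - 118784000 == 2048 * 104  # iteration 726, curriculum seqlen 104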
- iteration 733/ 292968 | consumed samples: 1501184 | consumed tokens: 120487936 | elapsed time per iteration (ms): 100036.0 | learning rate: 4.003E-05 | global batch size: 2048 | lm loss: 5.170466E+00 | loss scale: 8192.0 | grad norm: 9167.582 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 734/ 292968 | consumed samples: 1503232 | consumed tokens: 120700928 | elapsed time per iteration (ms): 96002.5 | learning rate: 4.009E-05 | global batch size: 2048 | lm loss: 5.182973E+00 | loss scale: 8192.0 | grad norm: 13538.983 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 735/ 292968 | consumed samples: 1505280 | consumed tokens: 120913920 | elapsed time per iteration (ms): 100550.0 | learning rate: 4.014E-05 | global batch size: 2048 | lm loss: 5.173321E+00 | loss scale: 8192.0 | grad norm: 10428.290 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 736/ 292968 | consumed samples: 1507328 | consumed tokens: 121126912 | elapsed time per iteration (ms): 99729.1 | learning rate: 4.020E-05 | global batch size: 2048 | lm loss: 5.158158E+00 | loss scale: 8192.0 | grad norm: 9562.448 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 737/ 292968 | consumed samples: 1509376 | consumed tokens: 121339904 | elapsed time per iteration (ms): 93444.2 | learning rate: 4.025E-05 | global batch size: 2048 | lm loss: 5.145337E+00 | loss scale: 8192.0 | grad norm: 8311.052 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 738/ 292968 | consumed samples: 1511424 | consumed tokens: 121552896 | elapsed time per iteration (ms): 92721.7 | learning rate: 4.030E-05 | global batch size: 2048 | lm loss: 5.145213E+00 | loss scale: 8192.0 | grad norm: 8964.069 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 739/ 292968 | consumed samples: 1513472 | consumed tokens: 121765888 | elapsed time per iteration (ms): 100955.4 | learning rate: 4.036E-05 | global batch size: 2048 | lm loss: 5.163105E+00 | loss scale: 8192.0 | grad norm: 12912.475 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 740/ 292968 | consumed samples: 1515520 | consumed tokens: 121978880 | elapsed time per iteration (ms): 99270.8 | learning rate: 4.041E-05 | global batch size: 2048 | lm loss: 5.160538E+00 | loss scale: 8192.0 | grad norm: 9533.689 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 741/ 292968 | consumed samples: 1517568 | consumed tokens: 122191872 | elapsed time per iteration (ms): 94688.6 | learning rate: 4.047E-05 | global batch size: 2048 | lm loss: 5.135939E+00 | loss scale: 8192.0 | grad norm: 9593.962 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 742/ 292968 | consumed samples: 1519616 | consumed tokens: 122404864 | elapsed time per iteration (ms): 102639.8 | learning rate: 4.052E-05 | global batch size: 2048 | lm loss: 5.130993E+00 | loss scale: 8192.0 | grad norm: 8530.196 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 743/ 292968 | consumed samples: 1521664 | consumed tokens: 122617856 | elapsed time per iteration (ms): 101938.9 | learning rate: 4.058E-05 | global batch size: 2048 | lm loss: 5.155418E+00 | loss scale: 8192.0 | grad norm: 14707.646 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 744/ 292968 | consumed samples: 1523712 | consumed tokens: 122830848 | elapsed time per iteration (ms): 95242.7 | learning rate: 4.063E-05 | global batch size: 2048 | lm loss: 5.123902E+00 | loss scale: 8192.0 | grad norm: 8235.325 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 745/ 292968 | consumed samples: 1525760 | consumed tokens: 123043840 | elapsed time per iteration (ms): 93999.9 | learning rate: 4.069E-05 | global batch size: 2048 | lm loss: 5.147910E+00 | loss scale: 8192.0 | grad norm: 9563.614 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 746/ 292968 | consumed samples: 1527808 | consumed tokens: 123256832 | elapsed time per iteration (ms): 95446.5 | learning rate: 4.074E-05 | global batch size: 2048 | lm loss: 5.089044E+00 | loss scale: 8192.0 | grad norm: 10209.814 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 747/ 292968 | consumed samples: 1529856 | consumed tokens: 123469824 | elapsed time per iteration (ms): 97706.0 | learning rate: 4.080E-05 | global batch size: 2048 | lm loss: 5.123481E+00 | loss scale: 8192.0 | grad norm: 9577.369 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 748/ 292968 | consumed samples: 1531904 | consumed tokens: 123682816 | elapsed time per iteration (ms): 96658.4 | learning rate: 4.085E-05 | global batch size: 2048 | lm loss: 5.084899E+00 | loss scale: 8192.0 | grad norm: 9740.223 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 749/ 292968 | consumed samples: 1533952 | consumed tokens: 123895808 | elapsed time per iteration (ms): 96157.2 | learning rate: 4.091E-05 | global batch size: 2048 | lm loss: 5.111638E+00 | loss scale: 8192.0 | grad norm: 9398.744 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 750/ 292968 | consumed samples: 1536000 | consumed tokens: 124108800 | elapsed time per iteration (ms): 94564.0 | learning rate: 4.096E-05 | global batch size: 2048 | lm loss: 5.120895E+00 | loss scale: 8192.0 | grad norm: 7518.660 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
------------------------------------------------------------------------------------------------
- validation loss at iteration 750 | lm loss value: 5.068503E+00 | lm loss PPL: 1.589363E+02 |
------------------------------------------------------------------------------------------------
- iteration 751/ 292968 | consumed samples: 1538048 | consumed tokens: 124321792 | elapsed time per iteration (ms): 333141.4 | learning rate: 4.101E-05 | global batch size: 2048 | lm loss: 5.079378E+00 | loss scale: 8192.0 | grad norm: 7726.654 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
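The PPL in the validation block is just the exponential of the validation lm loss (and the elapsed-time spike at iteration 751, 333141.4 ms versus roughly 100000 ms for its neighbors, plausibly reflects the cost of this evaluation pass); a one-line Python check against the logged values:

    import math

    # "lm loss PPL" is exp(lm loss); reproducing the value in the block above:
    ppl = math.exp(5.068503)
    print(f"{ppl:.3f}")  # ~158.936, matching "lm loss PPL: 1.589363E+02"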
- iteration 752/ 292968 | consumed samples: 1540096 | consumed tokens: 124534784 | elapsed time per iteration (ms): 102233.2 | learning rate: 4.107E-05 | global batch size: 2048 | lm loss: 5.096935E+00 | loss scale: 8192.0 | grad norm: 9254.879 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 753/ 292968 | consumed samples: 1542144 | consumed tokens: 124747776 | elapsed time per iteration (ms): 101243.5 | learning rate: 4.112E-05 | global batch size: 2048 | lm loss: 5.097287E+00 | loss scale: 8192.0 | grad norm: 8846.072 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 754/ 292968 | consumed samples: 1544192 | consumed tokens: 124960768 | elapsed time per iteration (ms): 99110.0 | learning rate: 4.118E-05 | global batch size: 2048 | lm loss: 5.078513E+00 | loss scale: 8192.0 | grad norm: 9823.396 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 755/ 292968 | consumed samples: 1546240 | consumed tokens: 125173760 | elapsed time per iteration (ms): 100200.1 | learning rate: 4.123E-05 | global batch size: 2048 | lm loss: 5.094606E+00 | loss scale: 8192.0 | grad norm: 8532.593 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 756/ 292968 | consumed samples: 1548288 | consumed tokens: 125386752 | elapsed time per iteration (ms): 111321.6 | learning rate: 4.129E-05 | global batch size: 2048 | lm loss: 5.062562E+00 | loss scale: 8192.0 | grad norm: 8071.326 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 757/ 292968 | consumed samples: 1550336 | consumed tokens: 125599744 | elapsed time per iteration (ms): 109875.2 | learning rate: 4.134E-05 | global batch size: 2048 | lm loss: 5.075614E+00 | loss scale: 8192.0 | grad norm: 12356.039 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 758/ 292968 | consumed samples: 1552384 | consumed tokens: 125812736 | elapsed time per iteration (ms): 97843.2 | learning rate: 4.140E-05 | global batch size: 2048 | lm loss: 5.081157E+00 | loss scale: 8192.0 | grad norm: 8401.689 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 759/ 292968 | consumed samples: 1554432 | consumed tokens: 126025728 | elapsed time per iteration (ms): 90172.0 | learning rate: 4.145E-05 | global batch size: 2048 | lm loss: 5.057127E+00 | loss scale: 8192.0 | grad norm: 8431.774 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 760/ 292968 | consumed samples: 1556480 | consumed tokens: 126238720 | elapsed time per iteration (ms): 87425.0 | learning rate: 4.151E-05 | global batch size: 2048 | lm loss: 5.069711E+00 | loss scale: 8192.0 | grad norm: 11070.843 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 761/ 292968 | consumed samples: 1558528 | consumed tokens: 126451712 | elapsed time per iteration (ms): 89045.4 | learning rate: 4.156E-05 | global batch size: 2048 | lm loss: 5.049534E+00 | loss scale: 8192.0 | grad norm: 7144.952 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 762/ 292968 | consumed samples: 1560576 | consumed tokens: 126664704 | elapsed time per iteration (ms): 83755.2 | learning rate: 4.162E-05 | global batch size: 2048 | lm loss: 5.069824E+00 | loss scale: 8192.0 | grad norm: 8186.557 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 763/ 292968 | consumed samples: 1562624 | consumed tokens: 126877696 | elapsed time per iteration (ms): 87031.3 | learning rate: 4.167E-05 | global batch size: 2048 | lm loss: 5.070203E+00 | loss scale: 8192.0 | grad norm: 10249.618 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 764/ 292968 | consumed samples: 1564672 | consumed tokens: 127090688 | elapsed time per iteration (ms): 96323.9 | learning rate: 4.172E-05 | global batch size: 2048 | lm loss: 5.041795E+00 | loss scale: 8192.0 | grad norm: 6110.238 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 765/ 292968 | consumed samples: 1566720 | consumed tokens: 127303680 | elapsed time per iteration (ms): 95407.6 | learning rate: 4.178E-05 | global batch size: 2048 | lm loss: 5.050972E+00 | loss scale: 8192.0 | grad norm: 6942.078 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 766/ 292968 | consumed samples: 1568768 | consumed tokens: 127516672 | elapsed time per iteration (ms): 92066.5 | learning rate: 4.183E-05 | global batch size: 2048 | lm loss: 5.050848E+00 | loss scale: 8192.0 | grad norm: 8828.824 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 767/ 292968 | consumed samples: 1570816 | consumed tokens: 127729664 | elapsed time per iteration (ms): 86795.1 | learning rate: 4.189E-05 | global batch size: 2048 | lm loss: 5.024844E+00 | loss scale: 8192.0 | grad norm: 9494.234 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 768/ 292968 | consumed samples: 1572864 | consumed tokens: 127942656 | elapsed time per iteration (ms): 84596.0 | learning rate: 4.194E-05 | global batch size: 2048 | lm loss: 5.050458E+00 | loss scale: 8192.0 | grad norm: 6947.254 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 769/ 292968 | consumed samples: 1574912 | consumed tokens: 128155648 | elapsed time per iteration (ms): 82331.4 | learning rate: 4.200E-05 | global batch size: 2048 | lm loss: 5.079420E+00 | loss scale: 8192.0 | grad norm: 9553.482 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 770/ 292968 | consumed samples: 1576960 | consumed tokens: 128368640 | elapsed time per iteration (ms): 91135.2 | learning rate: 4.205E-05 | global batch size: 2048 | lm loss: 5.038568E+00 | loss scale: 8192.0 | grad norm: 9302.073 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 771/ 292968 | consumed samples: 1579008 | consumed tokens: 128581632 | elapsed time per iteration (ms): 108818.0 | learning rate: 4.211E-05 | global batch size: 2048 | lm loss: 5.012247E+00 | loss scale: 8192.0 | grad norm: 10569.150 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 772/ 292968 | consumed samples: 1581056 | consumed tokens: 128794624 | elapsed time per iteration (ms): 114783.3 | learning rate: 4.216E-05 | global batch size: 2048 | lm loss: 5.053435E+00 | loss scale: 8192.0 | grad norm: 11083.778 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 773/ 292968 | consumed samples: 1583104 | consumed tokens: 129007616 | elapsed time per iteration (ms): 100264.0 | learning rate: 4.222E-05 | global batch size: 2048 | lm loss: 5.010720E+00 | loss scale: 8192.0 | grad norm: 7078.107 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 774/ 292968 | consumed samples: 1585152 | consumed tokens: 129220608 | elapsed time per iteration (ms): 95824.2 | learning rate: 4.227E-05 | global batch size: 2048 | lm loss: 5.013454E+00 | loss scale: 8192.0 | grad norm: 8401.244 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 775/ 292968 | consumed samples: 1587200 | consumed tokens: 129433600 | elapsed time per iteration (ms): 94123.9 | learning rate: 4.233E-05 | global batch size: 2048 | lm loss: 5.009838E+00 | loss scale: 8192.0 | grad norm: 9617.729 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 776/ 292968 | consumed samples: 1589248 | consumed tokens: 129646592 | elapsed time per iteration (ms): 89112.5 | learning rate: 4.238E-05 | global batch size: 2048 | lm loss: 5.017678E+00 | loss scale: 8192.0 | grad norm: 9007.882 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 777/ 292968 | consumed samples: 1591296 | consumed tokens: 129859584 | elapsed time per iteration (ms): 92165.9 | learning rate: 4.243E-05 | global batch size: 2048 | lm loss: 5.033987E+00 | loss scale: 8192.0 | grad norm: 9608.444 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 778/ 292968 | consumed samples: 1593344 | consumed tokens: 130072576 | elapsed time per iteration (ms): 101065.4 | learning rate: 4.249E-05 | global batch size: 2048 | lm loss: 5.002667E+00 | loss scale: 8192.0 | grad norm: 7645.585 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 779/ 292968 | consumed samples: 1595392 | consumed tokens: 130285568 | elapsed time per iteration (ms): 103886.2 | learning rate: 4.254E-05 | global batch size: 2048 | lm loss: 5.009189E+00 | loss scale: 8192.0 | grad norm: 10778.665 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 780/ 292968 | consumed samples: 1597440 | consumed tokens: 130498560 | elapsed time per iteration (ms): 108909.3 | learning rate: 4.260E-05 | global batch size: 2048 | lm loss: 4.980504E+00 | loss scale: 8192.0 | grad norm: 9767.510 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 781/ 292968 | consumed samples: 1599488 | consumed tokens: 130711552 | elapsed time per iteration (ms): 104478.6 | learning rate: 4.265E-05 | global batch size: 2048 | lm loss: 4.996379E+00 | loss scale: 8192.0 | grad norm: 7660.113 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 782/ 292968 | consumed samples: 1601536 | consumed tokens: 130924544 | elapsed time per iteration (ms): 91664.0 | learning rate: 4.271E-05 | global batch size: 2048 | lm loss: 5.040724E+00 | loss scale: 8192.0 | grad norm: 9442.212 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 783/ 292968 | consumed samples: 1603584 | consumed tokens: 131137536 | elapsed time per iteration (ms): 91019.4 | learning rate: 4.276E-05 | global batch size: 2048 | lm loss: 5.017748E+00 | loss scale: 8192.0 | grad norm: 8891.952 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 784/ 292968 | consumed samples: 1605632 | consumed tokens: 131350528 | elapsed time per iteration (ms): 95055.7 | learning rate: 4.282E-05 | global batch size: 2048 | lm loss: 5.025961E+00 | loss scale: 8192.0 | grad norm: 9335.834 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 785/ 292968 | consumed samples: 1607680 | consumed tokens: 131563520 | elapsed time per iteration (ms): 94297.1 | learning rate: 4.287E-05 | global batch size: 2048 | lm loss: 5.013981E+00 | loss scale: 8192.0 | grad norm: 8125.859 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 786/ 292968 | consumed samples: 1609728 | consumed tokens: 131776512 | elapsed time per iteration (ms): 92944.1 | learning rate: 4.293E-05 | global batch size: 2048 | lm loss: 5.034190E+00 | loss scale: 8192.0 | grad norm: 9627.790 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 787/ 292968 | consumed samples: 1611776 | consumed tokens: 131989504 | elapsed time per iteration (ms): 85550.5 | learning rate: 4.298E-05 | global batch size: 2048 | lm loss: 4.999897E+00 | loss scale: 8192.0 | grad norm: 9882.803 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 788/ 292968 | consumed samples: 1613824 | consumed tokens: 132202496 | elapsed time per iteration (ms): 87289.0 | learning rate: 4.304E-05 | global batch size: 2048 | lm loss: 4.983741E+00 | loss scale: 8192.0 | grad norm: 7437.587 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 789/ 292968 | consumed samples: 1615872 | consumed tokens: 132415488 | elapsed time per iteration (ms): 81611.2 | learning rate: 4.309E-05 | global batch size: 2048 | lm loss: 4.970300E+00 | loss scale: 8192.0 | grad norm: 8953.533 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 790/ 292968 | consumed samples: 1617920 | consumed tokens: 132628480 | elapsed time per iteration (ms): 88407.9 | learning rate: 4.314E-05 | global batch size: 2048 | lm loss: 4.995797E+00 | loss scale: 8192.0 | grad norm: 9455.538 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 791/ 292968 | consumed samples: 1619968 | consumed tokens: 132841472 | elapsed time per iteration (ms): 89986.6 | learning rate: 4.320E-05 | global batch size: 2048 | lm loss: 4.990129E+00 | loss scale: 8192.0 | grad norm: 8610.979 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 792/ 292968 | consumed samples: 1622016 | consumed tokens: 133054464 | elapsed time per iteration (ms): 92048.7 | learning rate: 4.325E-05 | global batch size: 2048 | lm loss: 4.980020E+00 | loss scale: 8192.0 | grad norm: 9159.021 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 793/ 292968 | consumed samples: 1624064 | consumed tokens: 133267456 | elapsed time per iteration (ms): 94085.7 | learning rate: 4.331E-05 | global batch size: 2048 | lm loss: 4.996900E+00 | loss scale: 8192.0 | grad norm: 7882.973 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 794/ 292968 | consumed samples: 1626112 | consumed tokens: 133480448 | elapsed time per iteration (ms): 94716.9 | learning rate: 4.336E-05 | global batch size: 2048 | lm loss: 5.017018E+00 | loss scale: 8192.0 | grad norm: 9046.810 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 795/ 292968 | consumed samples: 1628160 | consumed tokens: 133693440 | elapsed time per iteration (ms): 96610.2 | learning rate: 4.342E-05 | global batch size: 2048 | lm loss: 4.964896E+00 | loss scale: 8192.0 | grad norm: 10167.842 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 796/ 292968 | consumed samples: 1630208 | consumed tokens: 133906432 | elapsed time per iteration (ms): 96272.9 | learning rate: 4.347E-05 | global batch size: 2048 | lm loss: 4.980704E+00 | loss scale: 8192.0 | grad norm: 8754.157 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 797/ 292968 | consumed samples: 1632256 | consumed tokens: 134119424 | elapsed time per iteration (ms): 90417.5 | learning rate: 4.353E-05 | global batch size: 2048 | lm loss: 4.974670E+00 | loss scale: 8192.0 | grad norm: 8083.428 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 798/ 292968 | consumed samples: 1634304 | consumed tokens: 134332416 | elapsed time per iteration (ms): 85641.5 | learning rate: 4.358E-05 | global batch size: 2048 | lm loss: 4.956146E+00 | loss scale: 8192.0 | grad norm: 8358.883 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 799/ 292968 | consumed samples: 1636352 | consumed tokens: 134545408 | elapsed time per iteration (ms): 94590.7 | learning rate: 4.364E-05 | global batch size: 2048 | lm loss: 4.992686E+00 | loss scale: 8192.0 | grad norm: 8957.439 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 800/ 292968 | consumed samples: 1638400 | consumed tokens: 134758400 | elapsed time per iteration (ms): 112526.4 | learning rate: 4.369E-05 | global batch size: 2048 | lm loss: 4.980062E+00 | loss scale: 8192.0 | grad norm: 9224.950 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 801/ 292968 | consumed samples: 1640448 | consumed tokens: 134971392 | elapsed time per iteration (ms): 100262.8 | learning rate: 4.375E-05 | global batch size: 2048 | lm loss: 4.970032E+00 | loss scale: 8192.0 | grad norm: 10198.952 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 802/ 292968 | consumed samples: 1642496 | consumed tokens: 135184384 | elapsed time per iteration (ms): 91739.6 | learning rate: 4.380E-05 | global batch size: 2048 | lm loss: 4.931866E+00 | loss scale: 8192.0 | grad norm: 6971.804 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 803/ 292968 | consumed samples: 1644544 | consumed tokens: 135397376 | elapsed time per iteration (ms): 86653.8 | learning rate: 4.385E-05 | global batch size: 2048 | lm loss: 5.001899E+00 | loss scale: 8192.0 | grad norm: 8944.889 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 804/ 292968 | consumed samples: 1646592 | consumed tokens: 135610368 | elapsed time per iteration (ms): 84867.9 | learning rate: 4.391E-05 | global batch size: 2048 | lm loss: 5.002703E+00 | loss scale: 8192.0 | grad norm: 9886.276 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 805/ 292968 | consumed samples: 1648640 | consumed tokens: 135823360 | elapsed time per iteration (ms): 81891.9 | learning rate: 4.396E-05 | global batch size: 2048 | lm loss: 4.985003E+00 | loss scale: 8192.0 | grad norm: 9015.515 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 806/ 292968 | consumed samples: 1650688 | consumed tokens: 136036352 | elapsed time per iteration (ms): 85338.9 | learning rate: 4.402E-05 | global batch size: 2048 | lm loss: 4.967111E+00 | loss scale: 8192.0 | grad norm: 8968.275 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 807/ 292968 | consumed samples: 1652736 | consumed tokens: 136249344 | elapsed time per iteration (ms): 92816.3 | learning rate: 4.407E-05 | global batch size: 2048 | lm loss: 4.965900E+00 | loss scale: 8192.0 | grad norm: 8741.400 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 808/ 292968 | consumed samples: 1654784 | consumed tokens: 136462336 | elapsed time per iteration (ms): 92602.7 | learning rate: 4.413E-05 | global batch size: 2048 | lm loss: 4.950453E+00 | loss scale: 8192.0 | grad norm: 8776.107 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 809/ 292968 | consumed samples: 1656832 | consumed tokens: 136675328 | elapsed time per iteration (ms): 88386.7 | learning rate: 4.418E-05 | global batch size: 2048 | lm loss: 4.991675E+00 | loss scale: 8192.0 | grad norm: 9313.477 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 810/ 292968 | consumed samples: 1658880 | consumed tokens: 136888320 | elapsed time per iteration (ms): 83652.9 | learning rate: 4.424E-05 | global batch size: 2048 | lm loss: 4.956954E+00 | loss scale: 8192.0 | grad norm: 7602.587 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 811/ 292968 | consumed samples: 1660928 | consumed tokens: 137101312 | elapsed time per iteration (ms): 85518.2 | learning rate: 4.429E-05 | global batch size: 2048 | lm loss: 4.955671E+00 | loss scale: 8192.0 | grad norm: 8268.537 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 812/ 292968 | consumed samples: 1662976 | consumed tokens: 137314304 | elapsed time per iteration (ms): 83330.1 | learning rate: 4.435E-05 | global batch size: 2048 | lm loss: 4.940743E+00 | loss scale: 8192.0 | grad norm: 8706.922 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 813/ 292968 | consumed samples: 1665024 | consumed tokens: 137527296 | elapsed time per iteration (ms): 80130.6 | learning rate: 4.440E-05 | global batch size: 2048 | lm loss: 4.934225E+00 | loss scale: 8192.0 | grad norm: 8743.773 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 814/ 292968 | consumed samples: 1667072 | consumed tokens: 137740288 | elapsed time per iteration (ms): 86813.1 | learning rate: 4.446E-05 | global batch size: 2048 | lm loss: 4.949559E+00 | loss scale: 8192.0 | grad norm: 8388.369 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 815/ 292968 | consumed samples: 1669120 | consumed tokens: 137953280 | elapsed time per iteration (ms): 89539.9 | learning rate: 4.451E-05 | global batch size: 2048 | lm loss: 4.965991E+00 | loss scale: 8192.0 | grad norm: 9445.282 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 816/ 292968 | consumed samples: 1671168 | consumed tokens: 138166272 | elapsed time per iteration (ms): 88506.4 | learning rate: 4.456E-05 | global batch size: 2048 | lm loss: 4.950090E+00 | loss scale: 8192.0 | grad norm: 10925.595 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 817/ 292968 | consumed samples: 1673216 | consumed tokens: 138379264 | elapsed time per iteration (ms): 90316.5 | learning rate: 4.462E-05 | global batch size: 2048 | lm loss: 4.970661E+00 | loss scale: 8192.0 | grad norm: 7185.283 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 818/ 292968 | consumed samples: 1675264 | consumed tokens: 138592256 | elapsed time per iteration (ms): 92040.1 | learning rate: 4.467E-05 | global batch size: 2048 | lm loss: 4.979756E+00 | loss scale: 8192.0 | grad norm: 9220.821 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 819/ 292968 | consumed samples: 1677312 | consumed tokens: 138805248 | elapsed time per iteration (ms): 94418.9 | learning rate: 4.473E-05 | global batch size: 2048 | lm loss: 4.949591E+00 | loss scale: 8192.0 | grad norm: 8817.630 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 820/ 292968 | consumed samples: 1679360 | consumed tokens: 139018240 | elapsed time per iteration (ms): 90756.1 | learning rate: 4.478E-05 | global batch size: 2048 | lm loss: 4.935697E+00 | loss scale: 8192.0 | grad norm: 8306.430 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 821/ 292968 | consumed samples: 1681408 | consumed tokens: 139231232 | elapsed time per iteration (ms): 87975.8 | learning rate: 4.484E-05 | global batch size: 2048 | lm loss: 4.940872E+00 | loss scale: 8192.0 | grad norm: 7791.004 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 822/ 292968 | consumed samples: 1683456 | consumed tokens: 139444224 | elapsed time per iteration (ms): 98225.4 | learning rate: 4.489E-05 | global batch size: 2048 | lm loss: 4.946635E+00 | loss scale: 8192.0 | grad norm: 6264.309 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 823/ 292968 | consumed samples: 1685504 | consumed tokens: 139657216 | elapsed time per iteration (ms): 94571.3 | learning rate: 4.495E-05 | global batch size: 2048 | lm loss: 4.897384E+00 | loss scale: 8192.0 | grad norm: 6329.339 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 824/ 292968 | consumed samples: 1687552 | consumed tokens: 139870208 | elapsed time per iteration (ms): 93375.9 | learning rate: 4.500E-05 | global batch size: 2048 | lm loss: 4.933838E+00 | loss scale: 8192.0 | grad norm: 6873.402 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 825/ 292968 | consumed samples: 1689600 | consumed tokens: 140083200 | elapsed time per iteration (ms): 84405.2 | learning rate: 4.506E-05 | global batch size: 2048 | lm loss: 4.940725E+00 | loss scale: 8192.0 | grad norm: 8215.687 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 826/ 292968 | consumed samples: 1691648 | consumed tokens: 140296192 | elapsed time per iteration (ms): 86587.0 | learning rate: 4.511E-05 | global batch size: 2048 | lm loss: 4.924040E+00 | loss scale: 8192.0 | grad norm: 9743.582 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 827/ 292968 | consumed samples: 1693696 | consumed tokens: 140509184 | elapsed time per iteration (ms): 81518.6 | learning rate: 4.517E-05 | global batch size: 2048 | lm loss: 4.931610E+00 | loss scale: 8192.0 | grad norm: 10199.890 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 828/ 292968 | consumed samples: 1695744 | consumed tokens: 140722176 | elapsed time per iteration (ms): 84996.5 | learning rate: 4.522E-05 | global batch size: 2048 | lm loss: 4.906430E+00 | loss scale: 8192.0 | grad norm: 7666.318 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 829/ 292968 | consumed samples: 1697792 | consumed tokens: 140935168 | elapsed time per iteration (ms): 90229.1 | learning rate: 4.527E-05 | global batch size: 2048 | lm loss: 4.939106E+00 | loss scale: 8192.0 | grad norm: 8603.777 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 830/ 292968
| consumed samples: 1699840 | consumed tokens: 141148160 | elapsed time per iteration (ms): 93250.0 | learning rate: 4.533E-05 | global batch size: 2048 | lm loss: 4.908719E+00 | loss scale: 8192.0 | grad norm: 9286.576 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 831/ 292968 | consumed samples: 1701888 | consumed tokens: 141361152 | elapsed time per iteration (ms): 91608.7 | learning rate: 4.538E-05 | global batch size: 2048 | lm loss: 4.922731E+00 | loss scale: 8192.0 | grad norm: 7918.632 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 832/ 292968 | consumed samples: 1703936 | consumed tokens: 141574144 | elapsed time per iteration (ms): 86694.8 | learning rate: 4.544E-05 | global batch size: 2048 | lm loss: 4.898895E+00 | loss scale: 8192.0 | grad norm: 8033.319 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 833/ 292968 | consumed samples: 1705984 | consumed tokens: 141787136 | elapsed time per iteration (ms): 85204.7 | learning rate: 4.549E-05 | global batch size: 2048 | lm loss: 4.917194E+00 | loss scale: 8192.0 | grad norm: 10834.592 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 834/ 292968 | consumed samples: 1708032 | consumed tokens: 142000128 | elapsed time per iteration (ms): 81634.2 | learning rate: 4.555E-05 | global batch size: 2048 | lm loss: 4.922104E+00 | loss scale: 8192.0 | grad norm: 10094.288 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 835/ 292968 | consumed samples: 1710080 | consumed tokens: 142213120 | elapsed time per iteration (ms): 84097.7 | learning rate: 4.560E-05 | global batch size: 2048 | lm loss: 4.917187E+00 | loss scale: 8192.0 | grad norm: 7270.369 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 836/ 292968 | consumed samples: 1712128 | consumed tokens: 142426112 | elapsed time per iteration (ms): 87917.1 | learning rate: 4.566E-05 | global batch size: 2048 | lm loss: 4.902526E+00 | loss scale: 8192.0 | grad norm: 6836.908 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 837/ 292968 | consumed samples: 1714176 | consumed tokens: 142639104 | elapsed time per iteration (ms): 100500.4 | learning rate: 4.571E-05 | global batch size: 2048 | lm loss: 4.897984E+00 | loss scale: 8192.0 | grad norm: 7481.614 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 838/ 292968 | consumed samples: 1716224 | consumed tokens: 142852096 | elapsed time per iteration (ms): 102112.7 | learning rate: 4.577E-05 | global batch size: 2048 | lm loss: 4.925521E+00 | loss scale: 8192.0 | grad norm: 7310.433 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 839/ 292968 | consumed samples: 1718272 | consumed tokens: 143065088 | elapsed time per iteration (ms): 99107.8 | learning rate: 4.582E-05 | global batch size: 2048 | lm loss: 4.924427E+00 | loss scale: 8192.0 | grad norm: 11633.882 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 840/ 292968 | consumed samples: 1720320 | consumed tokens: 143278080 | elapsed time per iteration (ms): 90866.9 | learning rate: 4.588E-05 | global batch size: 2048 | lm loss: 4.868423E+00 | loss scale: 8192.0 | grad norm: 9305.986 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 841/ 292968 | consumed samples: 1722368 | consumed tokens: 143491072 | elapsed time per iteration (ms): 83759.4 | learning rate: 4.593E-05 | global batch size: 2048 | lm loss: 4.898633E+00 | loss scale: 8192.0 | grad norm: 7195.413 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 842/ 292968 | consumed samples: 1724416 | consumed tokens: 143704064 | elapsed time per iteration (ms): 85648.3 | learning rate: 4.598E-05 | global batch size: 2048 | lm loss: 4.921449E+00 | loss scale: 8192.0 | grad norm: 9566.656 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 843/ 292968 | consumed samples: 1726464 | consumed tokens: 143917056 | elapsed time per iteration (ms): 84190.8 | learning rate: 4.604E-05 | global batch size: 2048 | lm loss: 4.900602E+00 | loss scale: 8192.0 | grad norm: 10408.447 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 844/ 292968 | consumed samples: 1728512 | consumed tokens: 144130048 | elapsed time per iteration (ms): 91602.2 | learning rate: 4.609E-05 | global batch size: 2048 | lm loss: 4.890003E+00 | loss scale: 8192.0 | grad norm: 8738.267 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 845/ 292968 | consumed samples: 1730560 | consumed tokens: 144343040 | elapsed time per iteration (ms): 103003.9 | learning rate: 4.615E-05 | global batch size: 2048 | lm loss: 4.887909E+00 | loss scale: 8192.0 | grad norm: 8903.043 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 846/ 292968 | consumed samples: 1732608 | consumed tokens: 144556032 | elapsed time per iteration (ms): 102448.8 | learning rate: 4.620E-05 | global batch size: 2048 | lm loss: 4.901354E+00 | loss scale: 8192.0 | grad norm: 8394.797 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 847/ 292968 | consumed samples: 1734656 | consumed tokens: 144769024 | elapsed time per iteration (ms): 92334.8 | learning rate: 4.626E-05 | global batch size: 2048 | lm loss: 4.864662E+00 | loss scale: 8192.0 | grad norm: 7321.009 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 848/ 292968 | consumed samples: 1736704 | consumed tokens: 144982016 | elapsed time per iteration (ms): 90878.0 | learning rate: 4.631E-05 | global batch size: 2048 | lm loss: 4.916307E+00 | loss scale: 8192.0 | grad norm: 5756.623 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 849/ 292968 | consumed samples: 1738752 | consumed tokens: 145195008 | elapsed time per iteration (ms): 89266.6 | learning rate: 4.637E-05 | global batch size: 2048 | lm loss: 4.855122E+00 | loss scale: 8192.0 | 
grad norm: 9582.732 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 850/ 292968 | consumed samples: 1740800 | consumed tokens: 145408000 | elapsed time per iteration (ms): 96327.2 | learning rate: 4.642E-05 | global batch size: 2048 | lm loss: 4.892194E+00 | loss scale: 8192.0 | grad norm: 9798.677 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 851/ 292968 | consumed samples: 1742848 | consumed tokens: 145620992 | elapsed time per iteration (ms): 97683.9 | learning rate: 4.648E-05 | global batch size: 2048 | lm loss: 4.890501E+00 | loss scale: 8192.0 | grad norm: 8247.303 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 852/ 292968 | consumed samples: 1744896 | consumed tokens: 145833984 | elapsed time per iteration (ms): 95001.0 | learning rate: 4.653E-05 | global batch size: 2048 | lm loss: 4.879304E+00 | loss scale: 8192.0 | grad norm: 7524.410 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 853/ 292968 | consumed samples: 1746944 | consumed tokens: 146046976 | elapsed time per iteration (ms): 96419.5 | learning rate: 4.659E-05 | global batch size: 2048 | lm loss: 4.880531E+00 | loss scale: 8192.0 | grad norm: 6292.680 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 854/ 292968 | consumed samples: 1748992 | consumed tokens: 146259968 | elapsed time per iteration (ms): 93423.5 | learning rate: 4.664E-05 | global batch size: 2048 | lm loss: 4.885491E+00 | loss scale: 8192.0 | grad norm: 6244.983 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 855/ 292968 | consumed samples: 1751040 | consumed tokens: 146472960 | elapsed time per iteration (ms): 96752.8 | learning rate: 4.669E-05 | global batch size: 2048 | lm loss: 4.879394E+00 | loss scale: 8192.0 | grad norm: 8094.707 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 856/ 292968 | consumed samples: 1753088 | consumed tokens: 146685952 | elapsed time per iteration (ms): 98609.6 | learning rate: 4.675E-05 | global batch size: 2048 | lm loss: 4.897543E+00 | loss scale: 8192.0 | grad norm: 10528.108 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 857/ 292968 | consumed samples: 1755136 | consumed tokens: 146898944 | elapsed time per iteration (ms): 101866.2 | learning rate: 4.680E-05 | global batch size: 2048 | lm loss: 4.872301E+00 | loss scale: 8192.0 | grad norm: 5950.747 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 858/ 292968 | consumed samples: 1757184 | consumed tokens: 147111936 | elapsed time per iteration (ms): 100241.8 | learning rate: 4.686E-05 | global batch size: 2048 | lm loss: 4.864903E+00 | loss scale: 8192.0 | grad norm: 8402.951 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 859/ 292968 | consumed samples: 1759232 | consumed tokens: 147324928 | elapsed time per iteration (ms): 99453.9 | learning rate: 
4.691E-05 | global batch size: 2048 | lm loss: 4.896625E+00 | loss scale: 8192.0 | grad norm: 10338.239 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 860/ 292968 | consumed samples: 1761280 | consumed tokens: 147537920 | elapsed time per iteration (ms): 100045.5 | learning rate: 4.697E-05 | global batch size: 2048 | lm loss: 4.874730E+00 | loss scale: 8192.0 | grad norm: 9924.164 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 861/ 292968 | consumed samples: 1763328 | consumed tokens: 147750912 | elapsed time per iteration (ms): 101019.5 | learning rate: 4.702E-05 | global batch size: 2048 | lm loss: 4.858073E+00 | loss scale: 8192.0 | grad norm: 6834.896 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 862/ 292968 | consumed samples: 1765376 | consumed tokens: 147963904 | elapsed time per iteration (ms): 101431.9 | learning rate: 4.708E-05 | global batch size: 2048 | lm loss: 4.860143E+00 | loss scale: 8192.0 | grad norm: 9179.605 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 863/ 292968 | consumed samples: 1767424 | consumed tokens: 148176896 | elapsed time per iteration (ms): 99828.8 | learning rate: 4.713E-05 | global batch size: 2048 | lm loss: 4.875809E+00 | loss scale: 8192.0 | grad norm: 7926.040 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 864/ 292968 | consumed samples: 1769472 | consumed tokens: 148389888 | elapsed time per iteration (ms): 95553.4 | learning rate: 4.719E-05 | global batch size: 2048 | lm loss: 4.865411E+00 | loss scale: 8192.0 | grad norm: 7441.254 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 865/ 292968 | consumed samples: 1771520 | consumed tokens: 148602880 | elapsed time per iteration (ms): 93756.0 | learning rate: 4.724E-05 | global batch size: 2048 | lm loss: 4.852753E+00 | loss scale: 8192.0 | grad norm: 8675.096 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 866/ 292968 | consumed samples: 1773568 | consumed tokens: 148815872 | elapsed time per iteration (ms): 97398.2 | learning rate: 4.730E-05 | global batch size: 2048 | lm loss: 4.847681E+00 | loss scale: 8192.0 | grad norm: 7610.470 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 867/ 292968 | consumed samples: 1775616 | consumed tokens: 149028864 | elapsed time per iteration (ms): 102171.5 | learning rate: 4.735E-05 | global batch size: 2048 | lm loss: 4.854671E+00 | loss scale: 8192.0 | grad norm: 7714.149 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 868/ 292968 | consumed samples: 1777664 | consumed tokens: 149241856 | elapsed time per iteration (ms): 104486.1 | learning rate: 4.740E-05 | global batch size: 2048 | lm loss: 4.855896E+00 | loss scale: 8192.0 | grad norm: 11444.594 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 869/ 292968 | consumed samples: 1779712 | 
consumed tokens: 149454848 | elapsed time per iteration (ms): 97759.5 | learning rate: 4.746E-05 | global batch size: 2048 | lm loss: 4.848274E+00 | loss scale: 8192.0 | grad norm: 9475.868 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 870/ 292968 | consumed samples: 1781760 | consumed tokens: 149667840 | elapsed time per iteration (ms): 105938.5 | learning rate: 4.751E-05 | global batch size: 2048 | lm loss: 4.878920E+00 | loss scale: 8192.0 | grad norm: 6823.121 | num zeros: 0.0 | curriculum seqlen: 104 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 871/ 292968 | consumed samples: 1783808 | consumed tokens: 149897216 | elapsed time per iteration (ms): 104269.6 | learning rate: 4.757E-05 | global batch size: 2048 | lm loss: 4.930564E+00 | loss scale: 8192.0 | grad norm: 12571.704 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 872/ 292968 | consumed samples: 1785856 | consumed tokens: 150126592 | elapsed time per iteration (ms): 99263.8 | learning rate: 4.762E-05 | global batch size: 2048 | lm loss: 4.886007E+00 | loss scale: 8192.0 | grad norm: 7772.988 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 873/ 292968 | consumed samples: 1787904 | consumed tokens: 150355968 | elapsed time per iteration (ms): 99180.7 | learning rate: 4.768E-05 | global batch size: 2048 | lm loss: 4.948179E+00 | loss scale: 8192.0 | grad norm: 12283.943 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 874/ 292968 | consumed samples: 1789952 | consumed tokens: 150585344 | elapsed time per iteration (ms): 101656.8 | learning rate: 4.773E-05 | global batch size: 2048 | lm loss: 4.955140E+00 | loss scale: 8192.0 | grad norm: 12319.417 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 875/ 292968 | consumed samples: 1792000 | consumed tokens: 150814720 | elapsed time per iteration (ms): 103135.8 | learning rate: 4.779E-05 | global batch size: 2048 | lm loss: 4.902682E+00 | loss scale: 8192.0 | grad norm: 9807.029 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 876/ 292968 | consumed samples: 1794048 | consumed tokens: 151044096 | elapsed time per iteration (ms): 98803.0 | learning rate: 4.784E-05 | global batch size: 2048 | lm loss: 4.936249E+00 | loss scale: 8192.0 | grad norm: 10397.715 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 877/ 292968 | consumed samples: 1796096 | consumed tokens: 151273472 | elapsed time per iteration (ms): 96398.1 | learning rate: 4.790E-05 | global batch size: 2048 | lm loss: 4.918822E+00 | loss scale: 8192.0 | grad norm: 7879.327 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 878/ 292968 | consumed samples: 1798144 | consumed tokens: 151502848 | elapsed time per iteration (ms): 101064.5 | learning rate: 4.795E-05 | global batch size: 2048 | lm loss: 4.919652E+00 | loss scale: 8192.0 | grad norm: 12914.863 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of 
nan iterations: 0 | -time (ms) - iteration 879/ 292968 | consumed samples: 1800192 | consumed tokens: 151732224 | elapsed time per iteration (ms): 97176.5 | learning rate: 4.801E-05 | global batch size: 2048 | lm loss: 4.911604E+00 | loss scale: 8192.0 | grad norm: 8642.555 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 880/ 292968 | consumed samples: 1802240 | consumed tokens: 151961600 | elapsed time per iteration (ms): 97493.5 | learning rate: 4.806E-05 | global batch size: 2048 | lm loss: 4.883616E+00 | loss scale: 8192.0 | grad norm: 9014.739 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 881/ 292968 | consumed samples: 1804288 | consumed tokens: 152190976 | elapsed time per iteration (ms): 98999.2 | learning rate: 4.811E-05 | global batch size: 2048 | lm loss: 4.922104E+00 | loss scale: 8192.0 | grad norm: 10096.054 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 882/ 292968 | consumed samples: 1806336 | consumed tokens: 152420352 | elapsed time per iteration (ms): 100686.4 | learning rate: 4.817E-05 | global batch size: 2048 | lm loss: 4.881871E+00 | loss scale: 8192.0 | grad norm: 7396.601 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 883/ 292968 | consumed samples: 1808384 | consumed tokens: 152649728 | elapsed time per iteration (ms): 102986.0 | learning rate: 4.822E-05 | global batch size: 2048 | lm loss: 4.875857E+00 | loss scale: 8192.0 | grad norm: 11129.993 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 884/ 292968 | consumed samples: 1810432 | consumed tokens: 152879104 | elapsed time per iteration (ms): 100814.7 | learning rate: 4.828E-05 | global batch size: 2048 | lm loss: 4.873780E+00 | loss scale: 8192.0 | grad norm: 7583.244 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 885/ 292968 | consumed samples: 1812480 | consumed tokens: 153108480 | elapsed time per iteration (ms): 98098.5 | learning rate: 4.833E-05 | global batch size: 2048 | lm loss: 4.876161E+00 | loss scale: 8192.0 | grad norm: 7844.618 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 886/ 292968 | consumed samples: 1814528 | consumed tokens: 153337856 | elapsed time per iteration (ms): 98042.1 | learning rate: 4.839E-05 | global batch size: 2048 | lm loss: 4.846626E+00 | loss scale: 8192.0 | grad norm: 6534.741 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 887/ 292968 | consumed samples: 1816576 | consumed tokens: 153567232 | elapsed time per iteration (ms): 100560.3 | learning rate: 4.844E-05 | global batch size: 2048 | lm loss: 4.858187E+00 | loss scale: 8192.0 | grad norm: 7310.172 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 888/ 292968 | consumed samples: 1818624 | consumed tokens: 153796608 | elapsed time per iteration (ms): 94927.0 | learning rate: 4.850E-05 | global batch size: 2048 | lm loss: 4.865307E+00 | loss scale: 8192.0 | grad norm: 8373.225 | num 
zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 889/ 292968 | consumed samples: 1820672 | consumed tokens: 154025984 | elapsed time per iteration (ms): 95810.6 | learning rate: 4.855E-05 | global batch size: 2048 | lm loss: 4.873843E+00 | loss scale: 8192.0 | grad norm: 7997.646 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 890/ 292968 | consumed samples: 1822720 | consumed tokens: 154255360 | elapsed time per iteration (ms): 100664.6 | learning rate: 4.861E-05 | global batch size: 2048 | lm loss: 4.854215E+00 | loss scale: 8192.0 | grad norm: 7278.425 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 891/ 292968 | consumed samples: 1824768 | consumed tokens: 154484736 | elapsed time per iteration (ms): 94183.7 | learning rate: 4.866E-05 | global batch size: 2048 | lm loss: 4.831562E+00 | loss scale: 8192.0 | grad norm: 8215.406 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 892/ 292968 | consumed samples: 1826816 | consumed tokens: 154714112 | elapsed time per iteration (ms): 94501.8 | learning rate: 4.872E-05 | global batch size: 2048 | lm loss: 4.822918E+00 | loss scale: 8192.0 | grad norm: 8398.600 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 893/ 292968 | consumed samples: 1828864 | consumed tokens: 154943488 | elapsed time per iteration (ms): 92654.0 | learning rate: 4.877E-05 | global batch size: 2048 | lm loss: 4.790133E+00 | loss scale: 8192.0 | grad norm: 6692.713 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 894/ 292968 | consumed samples: 1830912 | consumed tokens: 155172864 | elapsed time per iteration (ms): 102837.7 | learning rate: 4.882E-05 | global batch size: 2048 | lm loss: 4.805981E+00 | loss scale: 8192.0 | grad norm: 6963.971 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 895/ 292968 | consumed samples: 1832960 | consumed tokens: 155402240 | elapsed time per iteration (ms): 104122.9 | learning rate: 4.888E-05 | global batch size: 2048 | lm loss: 4.783567E+00 | loss scale: 8192.0 | grad norm: 7102.400 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 896/ 292968 | consumed samples: 1835008 | consumed tokens: 155631616 | elapsed time per iteration (ms): 106477.7 | learning rate: 4.893E-05 | global batch size: 2048 | lm loss: 4.777409E+00 | loss scale: 8192.0 | grad norm: 7930.597 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 897/ 292968 | consumed samples: 1837056 | consumed tokens: 155860992 | elapsed time per iteration (ms): 105451.0 | learning rate: 4.899E-05 | global batch size: 2048 | lm loss: 4.824835E+00 | loss scale: 8192.0 | grad norm: 11162.164 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 898/ 292968 | consumed samples: 1839104 | consumed tokens: 156090368 | elapsed time per iteration (ms): 100719.2 | learning rate: 4.904E-05 | global batch 
size: 2048 | lm loss: 4.781737E+00 | loss scale: 8192.0 | grad norm: 6407.312 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 899/ 292968 | consumed samples: 1841152 | consumed tokens: 156319744 | elapsed time per iteration (ms): 100781.3 | learning rate: 4.910E-05 | global batch size: 2048 | lm loss: 4.789675E+00 | loss scale: 8192.0 | grad norm: 6338.078 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 900/ 292968 | consumed samples: 1843200 | consumed tokens: 156549120 | elapsed time per iteration (ms): 101122.5 | learning rate: 4.915E-05 | global batch size: 2048 | lm loss: 4.794979E+00 | loss scale: 8192.0 | grad norm: 8819.990 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) ------------------------------------------------------------------------------------------------ - validation loss at iteration 900 | lm loss value: 4.770841E+00 | lm loss PPL: 1.180185E+02 | ------------------------------------------------------------------------------------------------ -saving checkpoint at iteration 900 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-26 00:30:28,063] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/mp_rank_01_model_states.pt -[2021-10-26 00:30:28,279] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/mp_rank_00_model_states.pt -[2021-10-26 00:30:41,148] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-26 00:30:41,173] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_21_optim_states.pt -[2021-10-26 00:30:41,176] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-26 00:30:41,254] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_20_optim_states.pt -[2021-10-26 00:30:41,286] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-26 00:30:41,287] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-26 00:30:41,336] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-26 00:30:41,360] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_84_optim_states.pt -[2021-10-26 00:30:41,427] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
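The collapsed line above stands for 128 near-identical saves, one ZeRO optimizer-state shard per model-parallel rank (mp_rank 00 through 127, all logged at rank 0 of the data-parallel filename slot). A minimal sketch of how those shard paths can be enumerated, assuming only the filename pattern visible in this log (the helper name is hypothetical, not part of the training code):

    import os

    # Filename pattern as logged by _save_zero_checkpoint above:
    #   global_step{N}/zero_pp_rank_0_mp_rank_{MM}_optim_states.pt
    # (the "pp_rank" token in the filename is always 0 in this log)
    CKPT_ROOT = "/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints"

    def zero_shard_paths(step, n_mp_ranks=128, pp_rank=0):
        """Yield the expected per-rank optimizer-state shard paths for one step."""
        step_dir = os.path.join(CKPT_ROOT, "global_step%d" % step)
        for mp in range(n_mp_ranks):
            yield os.path.join(step_dir, "zero_pp_rank_%d_mp_rank_%02d_optim_states.pt" % (pp_rank, mp))

    # e.g. check that the iteration-900 save completed on disk:
    missing = [p for p in zero_shard_paths(900) if not os.path.exists(p)]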
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-26 00:30:46,531] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_23_optim_states.pt -[2021-10-26 00:30:46,844] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_22_optim_states.pt -[2021-10-26 00:30:47,643] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-26 00:30:49,082] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-26 00:30:49,603] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-26 00:30:50,070] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-26 00:30:50,392] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step900/zero_pp_rank_0_mp_rank_28_optim_states.pt - successfully saved checkpoint at iteration 900 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 25645.04 - iteration 901/ 292968 | consumed samples: 1845248 | consumed tokens: 156778496 | elapsed time per iteration (ms): 351748.4 | learning rate: 4.921E-05 | global batch size: 2048 | lm loss: 4.794399E+00 | loss scale: 8192.0 | grad norm: 9320.710 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 902/ 292968 | consumed samples: 1847296 | consumed tokens: 157007872 | elapsed time per iteration (ms): 107731.0 | learning rate: 4.926E-05 | global batch size: 2048 | lm loss: 4.813344E+00 | loss scale: 8192.0 | grad norm: 8270.918 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 903/ 292968 | consumed samples: 1849344 | consumed tokens: 157237248 | elapsed time per iteration (ms): 99902.9 | learning rate: 4.932E-05 | global batch size: 2048 | lm loss: 4.777577E+00 | loss scale: 8192.0 | grad norm: 10079.971 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 904/ 292968 | consumed samples: 1851392 | consumed tokens: 157466624 | elapsed time per iteration (ms): 101536.1 | learning rate: 4.937E-05 | global batch size: 2048 | lm loss: 4.804098E+00 | loss scale: 8192.0 | grad norm: 8363.455 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 905/ 292968 | consumed samples: 1853440 | consumed tokens: 157696000 | elapsed time per iteration (ms): 121260.1 | learning rate: 4.943E-05 | global batch size: 2048 | lm loss: 4.775804E+00 | loss scale: 8192.0 | grad norm: 8196.718 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 
- iteration 901/ 292968 | consumed samples: 1845248 | consumed tokens: 156778496 | elapsed time per iteration (ms): 351748.4 | learning rate: 4.921E-05 | global batch size: 2048 | lm loss: 4.794399E+00 | loss scale: 8192.0 | grad norm: 9320.710 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 902/ 292968 | consumed samples: 1847296 | consumed tokens: 157007872 | elapsed time per iteration (ms): 107731.0 | learning rate: 4.926E-05 | global batch size: 2048 | lm loss: 4.813344E+00 | loss scale: 8192.0 | grad norm: 8270.918 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 903/ 292968 | consumed samples: 1849344 | consumed tokens: 157237248 | elapsed time per iteration (ms): 99902.9 | learning rate: 4.932E-05 | global batch size: 2048 | lm loss: 4.777577E+00 | loss scale: 8192.0 | grad norm: 10079.971 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 904/ 292968 | consumed samples: 1851392 | consumed tokens: 157466624 | elapsed time per iteration (ms): 101536.1 | learning rate: 4.937E-05 | global batch size: 2048 | lm loss: 4.804098E+00 | loss scale: 8192.0 | grad norm: 8363.455 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 905/ 292968 | consumed samples: 1853440 | consumed tokens: 157696000 | elapsed time per iteration (ms): 121260.1 | learning rate: 4.943E-05 | global batch size: 2048 | lm loss: 4.775804E+00 | loss scale: 8192.0 | grad norm: 8196.718 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 906/ 292968 | consumed samples: 1855488 | consumed tokens: 157925376 | elapsed time per iteration (ms): 126844.2 | learning rate: 4.948E-05 | global batch size: 2048 | lm loss: 4.804184E+00 | loss scale: 8192.0 | grad norm: 8970.926 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 907/ 292968 | consumed samples: 1857536 | consumed tokens: 158154752 | elapsed time per iteration (ms): 120851.5 | learning rate: 4.953E-05 | global batch size: 2048 | lm loss: 4.794326E+00 | loss scale: 8192.0 | grad norm: 10288.692 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 908/ 292968 | consumed samples: 1859584 | consumed tokens: 158384128 | elapsed time per iteration (ms): 109883.6 | learning rate: 4.959E-05 | global batch size: 2048 | lm loss: 4.766080E+00 | loss scale: 8192.0 | grad norm: 7194.899 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 909/ 292968 | consumed samples: 1861632 | consumed tokens: 158613504 | elapsed time per iteration (ms): 101792.4 | learning rate: 4.964E-05 | global batch size: 2048 | lm loss: 4.791938E+00 | loss scale: 8192.0 | grad norm: 8309.770 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 910/ 292968 | consumed samples: 1863680 | consumed tokens: 158842880 | elapsed time per iteration (ms): 97324.5 | learning rate: 4.970E-05 | global batch size: 2048 | lm loss: 4.780250E+00 | loss scale: 8192.0 | grad norm: 9022.333 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 911/ 292968 | consumed samples: 1865728 | consumed tokens: 159072256 | elapsed time per iteration (ms): 99939.0 | learning rate: 4.975E-05 | global batch size: 2048 | lm loss: 4.790908E+00 | loss scale: 8192.0 | grad norm: 8841.215 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 912/ 292968 | consumed samples: 1867776 | consumed tokens: 159301632 | elapsed time per iteration (ms): 105940.2 | learning rate: 4.981E-05 | global batch size: 2048 | lm loss: 4.776813E+00 | loss scale: 8192.0 | grad norm: 7733.102 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 913/ 292968 | consumed samples: 1869824 | consumed tokens: 159531008 | elapsed time per iteration (ms): 106692.6 | learning rate: 4.986E-05 | global batch size: 2048 | lm loss: 4.776219E+00 | loss scale: 8192.0 | grad norm: 11014.266 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 914/ 292968 | consumed samples: 1871872 | consumed tokens: 159760384 | elapsed time per iteration (ms): 97775.0 | learning rate: 4.992E-05 | global batch size: 2048 | lm loss: 4.754172E+00 | loss scale: 8192.0 | grad norm: 6274.681 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 915/ 292968 | consumed samples: 1873920 | consumed tokens: 159989760 | elapsed time per iteration (ms): 105070.7 | learning rate: 4.997E-05 | global batch size: 2048 | lm loss: 4.767986E+00 | loss scale: 8192.0 | grad norm: 6311.649 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 916/ 292968 | consumed samples: 1875968 | consumed tokens: 160219136 | elapsed time per iteration (ms): 103850.3 | learning rate: 5.003E-05 | global batch size: 2048 | lm loss: 4.747984E+00 | loss scale: 8192.0 | grad norm: 6953.822 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 917/ 292968 | consumed samples: 1878016 | consumed tokens: 160448512 | elapsed time per iteration (ms): 98402.5 | learning rate: 5.008E-05 | global batch size: 2048 | lm loss: 4.758752E+00 | loss scale: 8192.0 | grad norm: 7966.168 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 918/ 292968 | consumed samples: 1880064 | consumed tokens: 160677888 | elapsed time per iteration (ms): 106433.4 | learning rate: 5.014E-05 | global batch size: 2048 | lm loss: 4.750968E+00 | loss scale: 8192.0 | grad norm: 9367.928 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 919/ 292968 | consumed samples: 1882112 | consumed tokens: 160907264 | elapsed time per iteration (ms): 102448.7 | learning rate: 5.019E-05 | global batch size: 2048 | lm loss: 4.737623E+00 | loss scale: 8192.0 | grad norm: 7219.830 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 920/ 292968 | consumed samples: 1884160 | consumed tokens: 161136640 | elapsed time per iteration (ms): 100941.7 | learning rate: 5.024E-05 | global batch size: 2048 | lm loss: 4.743581E+00 | loss scale: 8192.0 | grad norm: 5946.245 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 921/ 292968 | consumed samples: 1886208 | consumed tokens: 161366016 | elapsed time per iteration (ms): 100466.1 | learning rate: 5.030E-05 | global batch size: 2048 | lm loss: 4.742621E+00 | loss scale: 8192.0 | grad norm: 5831.992 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 922/ 292968 | consumed samples: 1888256 | consumed tokens: 161595392 | elapsed time per iteration (ms): 100590.7 | learning rate: 5.035E-05 | global batch size: 2048 | lm loss: 4.759139E+00 | loss scale: 8192.0 | grad norm: 7137.362 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 923/ 292968 | consumed samples: 1890304 | consumed tokens: 161824768 | elapsed time per iteration (ms): 98006.6 | learning rate: 5.041E-05 | global batch size: 2048 | lm loss: 4.745216E+00 | loss scale: 8192.0 | grad norm: 7862.406 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 924/ 292968 | consumed samples: 1892352 | consumed tokens: 162054144 | elapsed time per iteration (ms): 106082.0 | learning rate: 5.046E-05 | global batch size: 2048 | lm loss: 4.744426E+00 | loss scale: 8192.0 | grad norm: 8929.465 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 925/ 292968 | consumed samples: 1894400 | consumed tokens: 162283520 | elapsed time per iteration (ms): 105900.8 | learning rate: 5.052E-05 | global batch size: 2048 | lm loss: 4.734351E+00 | loss scale: 8192.0 | grad norm: 6590.063 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 926/ 292968 | consumed samples: 1896448 | consumed tokens: 162512896 | elapsed time per iteration (ms): 98388.1 | learning rate: 5.057E-05 | global batch size: 2048 | lm loss: 4.713094E+00 | loss scale: 8192.0 | grad norm: 6561.902 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 927/ 292968 | consumed samples: 1898496 | consumed tokens: 162742272 | elapsed time per iteration (ms): 99757.2 | learning rate: 5.063E-05 | global batch size: 2048 | lm loss: 4.726743E+00 | loss scale: 8192.0 | grad norm: 9593.008 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 928/ 292968 | consumed samples: 1900544 | consumed tokens: 162971648 | elapsed time per iteration (ms): 99215.7 | learning rate: 5.068E-05 | global batch size: 2048 | lm loss: 4.732288E+00 | loss scale: 8192.0 | grad norm: 9424.312 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 929/ 292968 | consumed samples: 1902592 | consumed tokens: 163201024 | elapsed time per iteration (ms): 95105.4 | learning rate: 5.074E-05 | global batch size: 2048 | lm loss: 4.710865E+00 | loss scale: 8192.0 | grad norm: 9029.592 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 930/ 292968 | consumed samples: 1904640 | consumed tokens: 163430400 | elapsed time per iteration (ms): 101031.9 | learning rate: 5.079E-05 | global batch size: 2048 | lm loss: 4.735913E+00 | loss scale: 8192.0 | grad norm: 8588.024 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 931/ 292968 | consumed samples: 1906688 | consumed tokens: 163659776 | elapsed time per iteration (ms): 123729.1 | learning rate: 5.085E-05 | global batch size: 2048 | lm loss: 4.713844E+00 | loss scale: 8192.0 | grad norm: 9506.565 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 932/ 292968 | consumed samples: 1908736 | consumed tokens: 163889152 | elapsed time per iteration (ms): 105456.4 | learning rate: 5.090E-05 | global batch size: 2048 | lm loss: 4.714865E+00 | loss scale: 8192.0 | grad norm: 7426.447 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 933/ 292968 | consumed samples: 1910784 | consumed tokens: 164118528 | elapsed time per iteration (ms): 99808.4 | learning rate: 5.095E-05 | global batch size: 2048 | lm loss: 4.720546E+00 | loss scale: 8192.0 | grad norm: 6825.281 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 934/ 292968 | consumed samples: 1912832 | consumed tokens: 164347904 | elapsed time per iteration (ms): 107610.2 | learning rate: 5.101E-05 | global batch size: 2048 | lm loss: 4.711742E+00 | loss scale: 8192.0 | grad norm: 7071.383 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 935/ 292968 | consumed samples: 1914880 | consumed tokens: 164577280 | elapsed time per iteration (ms): 101821.3 | learning rate: 5.106E-05 | global batch size: 2048 | lm loss: 4.748381E+00 | loss scale: 8192.0 | grad norm: 9050.332 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 936/ 292968 | consumed samples: 1916928 | consumed tokens: 164806656 | elapsed time per iteration (ms): 101083.3 | learning rate: 5.112E-05 | global batch size: 2048 | lm loss: 4.759112E+00 | loss scale: 8192.0 | grad norm: 8561.534 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 937/ 292968 | consumed samples: 1918976 | consumed tokens: 165036032 | elapsed time per iteration (ms): 117631.6 | learning rate: 5.117E-05 | global batch size: 2048 | lm loss: 4.700043E+00 | loss scale: 8192.0 | grad norm: 7439.252 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 938/ 292968 | consumed samples: 1921024 | consumed tokens: 165265408 | elapsed time per iteration (ms): 113062.7 | learning rate: 5.123E-05 | global batch size: 2048 | lm loss: 4.711950E+00 | loss scale: 8192.0 | grad norm: 7199.722 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 939/ 292968 | consumed samples: 1923072 | consumed tokens: 165494784 | elapsed time per iteration (ms): 119477.5 | learning rate: 5.128E-05 | global batch size: 2048 | lm loss: 4.746660E+00 | loss scale: 8192.0 | grad norm: 7564.670 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 940/ 292968 | consumed samples: 1925120 | consumed tokens: 165724160 | elapsed time per iteration (ms): 112883.7 | learning rate: 5.134E-05 | global batch size: 2048 | lm loss: 4.725538E+00 | loss scale: 8192.0 | grad norm: 7627.526 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 941/ 292968 | consumed samples: 1927168 | consumed tokens: 165953536 | elapsed time per iteration (ms): 100400.7 | learning rate: 5.139E-05 | global batch size: 2048 | lm loss: 4.711749E+00 | loss scale: 8192.0 | grad norm: 7270.336 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 942/ 292968 | consumed samples: 1929216 | consumed tokens: 166182912 | elapsed time per iteration (ms): 119462.0 | learning rate: 5.145E-05 | global batch size: 2048 | lm loss: 4.698702E+00 | loss scale: 8192.0 | grad norm: 8592.437 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 943/ 292968 | consumed samples: 1931264 | consumed tokens: 166412288 | elapsed time per iteration (ms): 116396.2 | learning rate: 5.150E-05 | global batch size: 2048 | lm loss: 4.733593E+00 | loss scale: 8192.0 | grad norm: 8296.782 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 944/ 292968 | consumed samples: 1933312 | consumed tokens: 166641664 | elapsed time per iteration (ms): 118632.1 | learning rate: 5.155E-05 | global batch size: 2048 | lm loss: 4.727538E+00 | loss scale: 8192.0 | grad norm: 5568.290 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 945/ 292968 | consumed samples: 1935360 | consumed tokens: 166871040 | elapsed time per iteration (ms): 119590.7 | learning rate: 5.161E-05 | global batch size: 2048 | lm loss: 4.700770E+00 | loss scale: 8192.0 | grad norm: 5369.944 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 946/ 292968 | consumed samples: 1937408 | consumed tokens: 167100416 | elapsed time per iteration (ms): 121932.4 | learning rate: 5.166E-05 | global batch size: 2048 | lm loss: 4.719340E+00 | loss scale: 8192.0 | grad norm: 6096.379 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 947/ 292968 | consumed samples: 1939456 | consumed tokens: 167329792 | elapsed time per iteration (ms): 121970.8 | learning rate: 5.172E-05 | global batch size: 2048 | lm loss: 4.700679E+00 | loss scale: 8192.0 | grad norm: 6406.117 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 948/ 292968 | consumed samples: 1941504 | consumed tokens: 167559168 | elapsed time per iteration (ms): 127800.0 | learning rate: 5.177E-05 | global batch size: 2048 | lm loss: 4.723674E+00 | loss scale: 8192.0 | grad norm: 6398.501 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 949/ 292968 | consumed samples: 1943552 | consumed tokens: 167788544 | elapsed time per iteration (ms): 113398.4 | learning rate: 5.183E-05 | global batch size: 2048 | lm loss: 4.729566E+00 | loss scale: 8192.0 | grad norm: 7737.068 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 950/ 292968 | consumed samples: 1945600 | consumed tokens: 168017920 | elapsed time per iteration (ms): 95389.2 | learning rate: 5.188E-05 | global batch size: 2048 | lm loss: 4.711073E+00 | loss scale: 8192.0 | grad norm: 8986.415 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 951/ 292968 | consumed samples: 1947648 | consumed tokens: 168247296 | elapsed time per iteration (ms): 92783.2 | learning rate: 5.194E-05 | global batch size: 2048 | lm loss: 4.718928E+00 | loss scale: 8192.0 | grad norm: 6872.582 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 952/ 292968 | consumed samples: 1949696 | consumed tokens: 168476672 | elapsed time per iteration (ms): 89619.6 | learning rate: 5.199E-05 | global batch size: 2048 | lm loss: 4.680629E+00 | loss scale: 8192.0 | grad norm: 8210.916 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 953/ 292968 | consumed samples: 1951744 | consumed tokens: 168706048 | elapsed time per iteration (ms): 94991.2 | learning rate: 5.205E-05 | global batch size: 2048 | lm loss: 4.708949E+00 | loss scale: 8192.0 | grad norm: 8854.260 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 954/ 292968 | consumed samples: 1953792 | consumed tokens: 168935424 | elapsed time per iteration (ms): 97273.1 | learning rate: 5.210E-05 | global batch size: 2048 | lm loss: 4.701236E+00 | loss scale: 8192.0 | grad norm: 8124.693 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 955/ 292968 | consumed samples: 1955840 | consumed tokens: 169164800 | elapsed time per iteration (ms): 103442.0 | learning rate: 5.216E-05 | global batch size: 2048 | lm loss: 4.700453E+00 | loss scale: 8192.0 | grad norm: 7387.737 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 956/ 292968 | consumed samples: 1957888 | consumed tokens: 169394176 | elapsed time per iteration (ms): 104467.9 | learning rate: 5.221E-05 | global batch size: 2048 | lm loss: 4.691464E+00 | loss scale: 8192.0 | grad norm: 8189.810 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 957/ 292968 | consumed samples: 1959936 | consumed tokens: 169623552 | elapsed time per iteration (ms): 90856.8 | learning rate: 5.226E-05 | global batch size: 2048 | lm loss: 4.680496E+00 | loss scale: 8192.0 | grad norm: 9239.126 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 958/ 292968 | consumed samples: 1961984 | consumed tokens: 169852928 | elapsed time per iteration (ms): 88455.1 | learning rate: 5.232E-05 | global batch size: 2048 | lm loss: 4.697749E+00 | loss scale: 8192.0 | grad norm: 5186.737 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 959/ 292968 | consumed samples: 1964032 | consumed tokens: 170082304 | elapsed time per iteration (ms): 89351.8 | learning rate: 5.237E-05 | global batch size: 2048 | lm loss: 4.688345E+00 | loss scale: 8192.0 | grad norm: 7375.103 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 960/ 292968 | consumed samples: 1966080 | consumed tokens: 170311680 | elapsed time per iteration (ms): 93605.5 | learning rate: 5.243E-05 | global batch size: 2048 | lm loss: 4.649884E+00 | loss scale: 8192.0 | grad norm: 7101.403 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 961/ 292968 | consumed samples: 1968128 | consumed tokens: 170541056 | elapsed time per iteration (ms): 99608.4 | learning rate: 5.248E-05 | global batch size: 2048 | lm loss: 4.661988E+00 | loss scale: 8192.0 | grad norm: 6274.319 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 962/ 292968 | consumed samples: 1970176 | consumed tokens: 170770432 | elapsed time per iteration (ms): 112064.2 | learning rate: 5.254E-05 | global batch size: 2048 | lm loss: 4.675498E+00 | loss scale: 8192.0 | grad norm: 6863.761 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 963/ 292968 | consumed samples: 1972224 | consumed tokens: 170999808 | elapsed time per iteration (ms): 102640.7 | learning rate: 5.259E-05 | global batch size: 2048 | lm loss: 4.668849E+00 | loss scale: 8192.0 | grad norm: 7405.085 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 964/ 292968 | consumed samples: 1974272 | consumed tokens: 171229184 | elapsed time per iteration (ms): 95944.9 | learning rate: 5.265E-05 | global batch size: 2048 | lm loss: 4.662077E+00 | loss scale: 8192.0 | grad norm: 7943.465 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 965/ 292968 | consumed samples: 1976320 | consumed tokens: 171458560 | elapsed time per iteration (ms): 98512.5 | learning rate: 5.270E-05 | global batch size: 2048 | lm loss: 4.703004E+00 | loss scale: 8192.0 | grad norm: 7356.277 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 966/ 292968 | consumed samples: 1978368 | consumed tokens: 171687936 | elapsed time per iteration (ms): 112302.3 | learning rate: 5.276E-05 | global batch size: 2048 | lm loss: 4.669021E+00 | loss scale: 8192.0 | grad norm: 6468.502 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 967/ 292968 | consumed samples: 1980416 | consumed tokens: 171917312 | elapsed time per iteration (ms): 109696.7 | learning rate: 5.281E-05 | global batch size: 2048 | lm loss: 4.685811E+00 | loss scale: 8192.0 | grad norm: 7984.873 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 968/ 292968 | consumed samples: 1982464 | consumed tokens: 172146688 | elapsed time per iteration (ms): 110874.2 | learning rate: 5.287E-05 | global batch size: 2048 | lm loss: 4.684606E+00 | loss scale: 8192.0 | grad norm: 9533.941 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 969/ 292968 | consumed samples: 1984512 | consumed tokens: 172376064 | elapsed time per iteration (ms): 108139.3 | learning rate: 5.292E-05 | global batch size: 2048 | lm loss: 4.651761E+00 | loss scale: 8192.0 | grad norm: 9383.782 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 970/ 292968 | consumed samples: 1986560 | consumed tokens: 172605440 | elapsed time per iteration (ms): 104049.0 | learning rate: 5.297E-05 | global batch size: 2048 | lm loss: 4.671356E+00 | loss scale: 8192.0 | grad norm: 8579.966 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 971/ 292968 | consumed samples: 1988608 | consumed tokens: 172834816 | elapsed time per iteration (ms): 101054.4 | learning rate: 5.303E-05 | global batch size: 2048 | lm loss: 4.653022E+00 | loss scale: 8192.0 | grad norm: 7775.476 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 972/ 292968 | consumed samples: 1990656 | consumed tokens: 173064192 | elapsed time per iteration (ms): 113876.9 | learning rate: 5.308E-05 | global batch size: 2048 | lm loss: 4.682260E+00 | loss scale: 8192.0 | grad norm: 7938.946 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 973/ 292968 | consumed samples: 1992704 | consumed tokens: 173293568 | elapsed time per iteration (ms): 110543.2 | learning rate: 5.314E-05 | global batch size: 2048 | lm loss: 4.655627E+00 | loss scale: 8192.0 | grad norm: 8926.092 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 974/ 292968 | consumed samples: 1994752 | consumed tokens: 173522944 | elapsed time per iteration (ms): 112023.2 | learning rate: 5.319E-05 | global batch size: 2048 | lm loss: 4.666007E+00 | loss scale: 8192.0 | grad norm: 9307.366 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 975/ 292968 | consumed samples: 1996800 | consumed tokens: 173752320 | elapsed time per iteration (ms): 107832.9 | learning rate: 5.325E-05 | global batch size: 2048 | lm loss: 4.650480E+00 | loss scale: 8192.0 | grad norm: 8410.476 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 976/ 292968 | consumed samples: 1998848 | consumed tokens: 173981696 | elapsed time per iteration (ms): 106612.7 | learning rate: 5.330E-05 | global batch size: 2048 | lm loss: 4.662186E+00 | loss scale: 8192.0 | grad norm: 7944.755 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 977/ 292968 | consumed samples: 2000896 | consumed tokens: 174211072 | elapsed time per iteration (ms): 99996.4 | learning rate: 5.336E-05 | global batch size: 2048 | lm loss: 4.647754E+00 | loss scale: 8192.0 | grad norm: 8004.437 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 978/ 292968 | consumed samples: 2002944 | consumed tokens: 174440448 | elapsed time per iteration (ms): 93238.2 | learning rate: 5.341E-05 | global batch size: 2048 | lm loss: 4.634857E+00 | loss scale: 8192.0 | grad norm: 7261.665 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 979/ 292968 | consumed samples: 2004992 | consumed tokens: 174669824 | elapsed time per iteration (ms): 91512.2 | learning rate: 5.347E-05 | global batch size: 2048 | lm loss: 4.680102E+00 | loss scale: 8192.0 | grad norm: 7111.941 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 980/ 292968 | consumed samples: 2007040 | consumed tokens: 174899200 | elapsed time per iteration (ms): 91030.0 | learning rate: 5.352E-05 | global batch size: 2048 | lm loss: 4.652774E+00 | loss scale: 8192.0 | grad norm: 7223.700 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 981/ 292968 | consumed samples: 2009088 | consumed tokens: 175128576 | elapsed time per iteration (ms): 101603.6 | learning rate: 5.358E-05 | global batch size: 2048 | lm loss: 4.663350E+00 | loss scale: 8192.0 | grad norm: 8987.379 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 982/ 292968 | consumed samples: 2011136 | consumed tokens: 175357952 | elapsed time per iteration (ms): 101200.3 | learning rate: 5.363E-05 | global batch size: 2048 | lm loss: 4.633442E+00 | loss scale: 8192.0 | grad norm: 7379.221 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 983/ 292968 | consumed samples: 2013184 | consumed tokens: 175587328 | elapsed time per iteration (ms): 108927.4 | learning rate: 5.368E-05 | global batch size: 2048 | lm loss: 4.665020E+00 | loss scale: 8192.0 | grad norm: 8020.847 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 984/ 292968 | consumed samples: 2015232 | consumed tokens: 175816704 | elapsed time per iteration (ms): 95944.9 | learning rate: 5.374E-05 | global batch size: 2048 | lm loss: 4.634257E+00 | loss scale: 8192.0 | grad norm: 8887.790 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 985/ 292968 | consumed samples: 2017280 | consumed tokens: 176046080 | elapsed time per iteration (ms): 100638.9 | learning rate: 5.379E-05 | global batch size: 2048 | lm loss: 4.611258E+00 | loss scale: 8192.0 | grad norm: 6280.043 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 986/ 292968 | consumed samples: 2019328 | consumed tokens: 176275456 | elapsed time per iteration (ms): 96832.0 | learning rate: 5.385E-05 | global batch size: 2048 | lm loss: 4.650913E+00 | loss scale: 8192.0 | grad norm: 9012.969 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 987/ 292968 | consumed samples: 2021376 | consumed tokens: 176504832 | elapsed time per iteration (ms): 92614.8 | learning rate: 5.390E-05 | global batch size: 2048 | lm loss: 4.636930E+00 | loss scale: 8192.0 | grad norm: 9856.347 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 988/ 292968 | consumed samples: 2023424 | consumed tokens: 176734208 | elapsed time per iteration (ms): 95657.7 | learning rate: 5.396E-05 | global batch size: 2048 | lm loss: 4.644852E+00 | loss scale: 8192.0 | grad norm: 5854.301 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 989/ 292968 | consumed samples: 2025472 | consumed tokens: 176963584 | elapsed time per iteration (ms): 103508.0 | learning rate: 5.401E-05 | global batch size: 2048 | lm loss: 4.662253E+00 | loss scale: 8192.0 | grad norm: 5325.829 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 990/ 292968 | consumed samples: 2027520 | consumed tokens: 177192960 | elapsed time per iteration (ms): 104861.5 | learning rate: 5.407E-05 | global batch size: 2048 | lm loss: 4.632265E+00 | loss scale: 8192.0 | grad norm: 5202.282 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 991/ 292968 | consumed samples: 2029568 | consumed tokens: 177422336 | elapsed time per iteration (ms): 95359.3 | learning rate: 5.412E-05 | global batch size: 2048 | lm loss: 4.647697E+00 | loss scale: 8192.0 | grad norm: 5178.402 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 992/ 292968 | consumed samples: 2031616 | consumed tokens: 177651712 | elapsed time per iteration (ms): 89337.9 | learning rate: 5.418E-05 | global batch size: 2048 | lm loss: 4.625400E+00 | loss scale: 8192.0 | grad norm: 5287.370 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 993/ 292968 | consumed samples: 2033664 | consumed tokens: 177881088 | elapsed time per iteration (ms): 87631.8 | learning rate: 5.423E-05 | global batch size: 2048 | lm loss: 4.638201E+00 | loss scale: 8192.0 | grad norm: 6183.282 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 994/ 292968 | consumed samples: 2035712 | consumed tokens: 178110464 | elapsed time per iteration (ms): 90783.4 | learning rate: 5.429E-05 | global batch size: 2048 | lm loss: 4.634704E+00 | loss scale: 8192.0 | grad norm: 7452.254 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 995/ 292968 | consumed samples: 2037760 | consumed tokens: 178339840 | elapsed time per iteration (ms): 99774.6 | learning rate: 5.434E-05 | global batch size: 2048 | lm loss: 4.647691E+00 | loss scale: 8192.0 | grad norm: 7479.692 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 996/ 292968 | consumed samples: 2039808 | consumed tokens: 178569216 | elapsed time per iteration (ms): 103529.1 | learning rate: 5.439E-05 | global batch size: 2048 | lm loss: 4.679801E+00 | loss scale: 8192.0 | grad norm: 8703.668 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 997/ 292968 | consumed samples: 2041856 | consumed tokens: 178798592 | elapsed time per iteration (ms): 109616.8 | learning rate: 5.445E-05 | global batch size: 2048 | lm loss: 4.625977E+00 | loss scale: 8192.0 | grad norm: 11308.011 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 998/ 292968 | consumed samples: 2043904 | consumed tokens: 179027968 | elapsed time per iteration (ms): 92923.3 | learning rate: 5.450E-05 | global batch size: 2048 | lm loss: 4.631541E+00 | loss scale: 8192.0 | grad norm: 7287.371 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 999/ 292968 | consumed samples: 2045952 | consumed tokens: 179257344 | elapsed time per iteration (ms): 90731.0 | learning rate: 5.456E-05 | global batch size: 2048 | lm loss: 4.669638E+00 | loss scale: 8192.0 | grad norm: 6599.368 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1000/ 292968 | consumed samples: 2048000 | consumed tokens: 179486720 | elapsed time per iteration (ms): 86701.8 | learning rate: 5.461E-05 | global batch size: 2048 | lm loss: 4.644469E+00 | loss scale: 16384.0 | grad norm: 6042.956 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1001/ 292968 | consumed samples: 2050048 | consumed tokens: 179716096 | elapsed time per iteration (ms): 84089.2 | learning rate: 5.467E-05 | global batch size: 2048 | lm loss: 4.632737E+00 | loss scale: 16384.0 | grad norm: 12576.031 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1002/ 292968 | consumed samples: 2052096 | consumed tokens: 179945472 | elapsed time per iteration (ms): 92512.8 | learning rate: 5.472E-05 | global batch size: 2048 | lm loss: 4.635237E+00 | loss scale: 16384.0 | grad norm: 16538.941 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1003/ 292968 | consumed samples: 2054144 | consumed tokens: 180174848 | elapsed time per iteration (ms): 97871.8 | learning rate: 5.478E-05 | global batch size: 2048 | lm loss: 4.622490E+00 | loss scale: 16384.0 | grad norm: 16097.182 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
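
The loss scale doubles from 8192.0 to 16384.0 at iteration 1000 above, with zero skipped and zero nan iterations throughout, which is the signature of dynamic loss scaling: the scale is doubled after a fixed window of overflow-free steps and halved (skipping the step) whenever gradients overflow. A schematic sketch of that policy in Python, assuming the common window of 1000 clean steps; this is not the exact DeepSpeed implementation:

    class DynamicLossScaler:
        # Schematic: double after `window` overflow-free steps, halve on overflow.
        def __init__(self, scale=8192.0, window=1000):
            self.scale = scale
            self.window = window
            self.clean_steps = 0

        def update(self, overflow):
            if overflow:
                self.scale /= 2      # back off; the optimizer step is skipped
                self.clean_steps = 0
            else:
                self.clean_steps += 1
                if self.clean_steps % self.window == 0:
                    self.scale *= 2  # a full clean window: raise the scale
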
- iteration 1004/ 292968 | consumed samples: 2056192 | consumed tokens: 180404224 | elapsed time per iteration (ms): 101193.9 | learning rate: 5.483E-05 | global batch size: 2048 | lm loss: 4.629216E+00 | loss scale: 16384.0 | grad norm: 19749.756 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1005/ 292968 | consumed samples: 2058240 | consumed tokens: 180633600 | elapsed time per iteration (ms): 88368.8 | learning rate: 5.489E-05 | global batch size: 2048 | lm loss: 4.640414E+00 | loss scale: 16384.0 | grad norm: 18825.119 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1006/ 292968 | consumed samples: 2060288 | consumed tokens: 180862976 | elapsed time per iteration (ms): 88871.6 | learning rate: 5.494E-05 | global batch size: 2048 | lm loss: 4.615625E+00 | loss scale: 16384.0 | grad norm: 13281.710 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1007/ 292968 | consumed samples: 2062336 | consumed tokens: 181092352 | elapsed time per iteration (ms): 84725.6 | learning rate: 5.500E-05 | global batch size: 2048 | lm loss: 4.622626E+00 | loss scale: 16384.0 | grad norm: 13062.628 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1008/ 292968 | consumed samples: 2064384 | consumed tokens: 181321728 | elapsed time per iteration (ms): 87689.7 | learning rate: 5.505E-05 | global batch size: 2048 | lm loss: 4.620416E+00 | loss scale: 16384.0 | grad norm: 13096.769 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1009/ 292968 | consumed samples: 2066432 | consumed tokens: 181551104 | elapsed time per iteration (ms): 94584.9 | learning rate: 5.510E-05 | global batch size: 2048 | lm loss: 4.593011E+00 | loss scale: 16384.0 | grad norm: 13236.008 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1010/ 292968 | consumed samples: 2068480 | consumed tokens: 181780480 | elapsed time per iteration (ms): 111032.7 | learning rate: 5.516E-05 | global batch size: 2048 | lm loss: 4.610099E+00 | loss scale: 16384.0 | grad norm: 11561.061 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1011/ 292968 | consumed samples: 2070528 | consumed tokens: 182009856 | elapsed time per iteration (ms): 112746.1 | learning rate: 5.521E-05 | global batch size: 2048 | lm loss: 4.616605E+00 | loss scale: 16384.0 | grad norm: 11205.156 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1012/ 292968 | consumed samples: 2072576 | consumed tokens: 182239232 | elapsed time per iteration (ms): 95249.3 | learning rate: 5.527E-05 | global batch size: 2048 | lm loss: 4.609317E+00 | loss scale: 16384.0 | grad norm: 16521.984 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1013/ 292968 | consumed samples: 2074624 | consumed tokens: 182468608 | elapsed time per iteration (ms): 89883.6 | learning rate: 5.532E-05 | global batch size: 2048 | lm loss: 4.627338E+00 | loss scale: 16384.0 | grad norm: 18254.591 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1014/ 292968 | consumed samples: 2076672 | consumed tokens: 182697984 | elapsed time per iteration (ms): 87517.6 | learning rate: 5.538E-05 | global batch size: 2048 | lm loss: 4.635683E+00 | loss scale: 16384.0 | grad norm: 12703.886 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1015/ 292968 | consumed samples: 2078720 | consumed tokens: 182927360 | elapsed time per iteration (ms): 90408.3 | learning rate: 5.543E-05 | global batch size: 2048 | lm loss: 4.639174E+00 | loss scale: 16384.0 | grad norm: 15780.033 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1016/ 292968 | consumed samples: 2080768 | consumed tokens: 183156736 | elapsed time per iteration (ms): 95790.8 | learning rate: 5.549E-05 | global batch size: 2048 | lm loss: 4.651465E+00 | loss scale: 16384.0 | grad norm: 18357.846 | num zeros: 0.0 | curriculum seqlen: 112 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1017/ 292968 | consumed samples: 2082816 | consumed tokens: 183402496 | elapsed time per iteration (ms): 96605.6 | learning rate: 5.554E-05 | global batch size: 2048 | lm loss: 4.730868E+00 | loss scale: 16384.0 | grad norm: 19856.035 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1018/ 292968 | consumed samples: 2084864 | consumed tokens: 183648256 | elapsed time per iteration (ms): 98007.6 | learning rate: 5.560E-05 | global batch size: 2048 | lm loss: 4.720872E+00 | loss scale: 16384.0 | grad norm: 17217.167 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1019/ 292968 | consumed samples: 2086912 | consumed tokens: 183894016 | elapsed time per iteration (ms): 95688.6 | learning rate: 5.565E-05 | global batch size: 2048 | lm loss: 4.744712E+00 | loss scale: 16384.0 | grad norm: 27358.685 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1020/ 292968 | consumed samples: 2088960 | consumed tokens: 184139776 | elapsed time per iteration (ms): 91725.7 | learning rate: 5.571E-05 | global batch size: 2048 | lm loss: 4.717334E+00 | loss scale: 16384.0 | grad norm: 18848.747 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1021/ 292968 | consumed samples: 2091008 | consumed tokens: 184385536 | elapsed time per iteration (ms): 83200.5 | learning rate: 5.576E-05 | global batch size: 2048 | lm loss: 4.709915E+00 | loss scale: 16384.0 | grad norm: 13274.099 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1022/ 292968 | consumed samples: 2093056 | consumed tokens: 184631296 | elapsed time per iteration (ms): 84514.6 | learning rate: 5.581E-05 | global batch size: 2048 | lm loss: 4.699132E+00 | loss scale: 16384.0 | grad norm: 16529.449 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
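
Curriculum learning steps the sequence length up from 112 to 120 at iteration 1017 above; the consumed-tokens increment grows accordingly, from 2048 x 112 = 229,376 to 2048 x 120 = 245,760 per iteration (183402496 - 183156736 = 245760). A schematic curriculum schedule, assuming a linear ramp rounded down to a multiple of 8; the actual minimum, maximum, and ramp length are not in this excerpt:

    def curriculum_seqlen(step, min_len=64, max_len=2048, ramp_steps=36000):
        # Linear ramp from min_len to max_len, rounded down to a multiple of 8
        # (all parameter values are assumptions; the log only shows 112 -> 120 here).
        frac = min(1.0, step / ramp_steps)
        raw = min_len + frac * (max_len - min_len)
        return min(max_len, int(raw) // 8 * 8)
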
- iteration 1023/ 292968 | consumed samples: 2095104 | consumed tokens: 184877056 | elapsed time per iteration (ms): 88438.2 | learning rate: 5.587E-05 | global batch size: 2048 | lm loss: 4.701824E+00 | loss scale: 16384.0 | grad norm: 19286.814 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1024/ 292968 | consumed samples: 2097152 | consumed tokens: 185122816 | elapsed time per iteration (ms): 87128.2 | learning rate: 5.592E-05 | global batch size: 2048 | lm loss: 4.665220E+00 | loss scale: 16384.0 | grad norm: 15609.304 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1025/ 292968 | consumed samples: 2099200 | consumed tokens: 185368576 | elapsed time per iteration (ms): 85968.3 | learning rate: 5.598E-05 | global batch size: 2048 | lm loss: 4.640831E+00 | loss scale: 16384.0 | grad norm: 19676.576 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1026/ 292968 | consumed samples: 2101248 | consumed tokens: 185614336 | elapsed time per iteration (ms): 83642.8 | learning rate: 5.603E-05 | global batch size: 2048 | lm loss: 4.646817E+00 | loss scale: 16384.0 | grad norm: 13333.088 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1027/ 292968 | consumed samples: 2103296 | consumed tokens: 185860096 | elapsed time per iteration (ms): 86824.0 | learning rate: 5.609E-05 | global batch size: 2048 | lm loss: 4.639713E+00 | loss scale: 16384.0 | grad norm: 16080.814 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1028/ 292968 | consumed samples: 2105344 | consumed tokens: 186105856 | elapsed time per iteration (ms): 87095.6 | learning rate: 5.614E-05 | global batch size: 2048 | lm loss: 4.648982E+00 | loss scale: 16384.0 | grad norm: 16331.743 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1029/ 292968 | consumed samples: 2107392 | consumed tokens: 186351616 | elapsed time per iteration (ms): 92620.7 | learning rate: 5.620E-05 | global batch size: 2048 | lm loss: 4.633156E+00 | loss scale: 16384.0 | grad norm: 14530.201 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1030/ 292968 | consumed samples: 2109440 | consumed tokens: 186597376 | elapsed time per iteration (ms): 92232.1 | learning rate: 5.625E-05 | global batch size: 2048 | lm loss: 4.643631E+00 | loss scale: 16384.0 | grad norm: 14406.385 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1031/ 292968 | consumed samples: 2111488 | consumed tokens: 186843136 | elapsed time per iteration (ms): 95216.0 | learning rate: 5.631E-05 | global batch size: 2048 | lm loss: 4.639384E+00 | loss scale: 16384.0 | grad norm: 16406.436 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1032/ 292968 | consumed samples: 2113536 | consumed tokens: 187088896 | elapsed time per iteration (ms): 94094.3 | learning rate: 5.636E-05 | global batch size: 2048 | lm loss: 4.619623E+00 | loss scale: 16384.0 | grad norm: 13155.816 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1033/ 292968 | consumed samples: 2115584 | consumed tokens: 187334656 | elapsed time per iteration (ms): 96697.7 | learning rate: 5.642E-05 | global batch size: 2048 | lm loss: 4.602153E+00 | loss scale: 16384.0 | grad norm: 11455.173 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1034/ 292968 | consumed samples: 2117632 | consumed tokens: 187580416 | elapsed time per iteration (ms): 86040.4 | learning rate: 5.647E-05 | global batch size: 2048 | lm loss: 4.580738E+00 | loss scale: 16384.0 | grad norm: 16306.380 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1035/ 292968 | consumed samples: 2119680 | consumed tokens: 187826176 | elapsed time per iteration (ms): 84865.8 | learning rate: 5.652E-05 | global batch size: 2048 | lm loss: 4.570907E+00 | loss scale: 16384.0 | grad norm: 13308.733 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1036/ 292968 | consumed samples: 2121728 | consumed tokens: 188071936 | elapsed time per iteration (ms): 90659.7 | learning rate: 5.658E-05 | global batch size: 2048 | lm loss: 4.588990E+00 | loss scale: 16384.0 | grad norm: 11695.611 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1037/ 292968 | consumed samples: 2123776 | consumed tokens: 188317696 | elapsed time per iteration (ms): 97978.0 | learning rate: 5.663E-05 | global batch size: 2048 | lm loss: 4.573851E+00 | loss scale: 16384.0 | grad norm: 10910.782 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1038/ 292968 | consumed samples: 2125824 | consumed tokens: 188563456 | elapsed time per iteration (ms): 92816.0 | learning rate: 5.669E-05 | global batch size: 2048 | lm loss: 4.555734E+00 | loss scale: 16384.0 | grad norm: 8363.462 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1039/ 292968 | consumed samples: 2127872 | consumed tokens: 188809216 | elapsed time per iteration (ms): 86340.8 | learning rate: 5.674E-05 | global batch size: 2048 | lm loss: 4.585401E+00 | loss scale: 16384.0 | grad norm: 10335.560 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1040/ 292968 | consumed samples: 2129920 | consumed tokens: 189054976 | elapsed time per iteration (ms): 85202.4 | learning rate: 5.680E-05 | global batch size: 2048 | lm loss: 4.573298E+00 | loss scale: 16384.0 | grad norm: 11608.905 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1041/ 292968 | consumed samples: 2131968 | consumed tokens: 189300736 | elapsed time per iteration (ms): 87124.6 | learning rate: 5.685E-05 | global batch size: 2048 | lm loss: 4.556459E+00 | loss scale: 16384.0 | grad norm: 14485.830 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1042/ 292968 | consumed samples: 2134016 | consumed tokens: 189546496 | elapsed time per iteration (ms): 91170.1 | learning rate: 5.691E-05 | global batch size: 2048 | lm loss: 4.579956E+00 | loss scale: 16384.0 | grad norm: 14404.215 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1043/ 292968 | consumed samples: 2136064 | consumed tokens: 189792256 | elapsed time per iteration (ms): 86918.4 | learning rate: 5.696E-05 | global batch size: 2048 | lm loss: 4.575088E+00 | loss scale: 16384.0 | grad norm: 19708.669 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1044/ 292968 | consumed samples: 2138112 | consumed tokens: 190038016 | elapsed time per iteration (ms): 84138.4 | learning rate: 5.702E-05 | global batch size: 2048 | lm loss: 4.584353E+00 | loss scale: 16384.0 | grad norm: 17005.198 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1045/ 292968 | consumed samples: 2140160 | consumed tokens: 190283776 | elapsed time per iteration (ms): 85099.9 | learning rate: 5.707E-05 | global batch size: 2048 | lm loss: 4.568626E+00 | loss scale: 16384.0 | grad norm: 12284.854 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1046/ 292968 | consumed samples: 2142208 | consumed tokens: 190529536 | elapsed time per iteration (ms): 82683.6 | learning rate: 5.713E-05 | global batch size: 2048 | lm loss: 4.594832E+00 | loss scale: 16384.0 | grad norm: 12682.882 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1047/ 292968 | consumed samples: 2144256 | consumed tokens: 190775296 | elapsed time per iteration (ms): 81428.4 | learning rate: 5.718E-05 | global batch size: 2048 | lm loss: 4.556969E+00 | loss scale: 16384.0 | grad norm: 23148.174 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1048/ 292968 | consumed samples: 2146304 | consumed tokens: 191021056 | elapsed time per iteration (ms): 81907.0 | learning rate: 5.723E-05 | global batch size: 2048 | lm loss: 4.562154E+00 | loss scale: 16384.0 | grad norm: 19384.473 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1049/ 292968 | consumed samples: 2148352 | consumed tokens: 191266816 | elapsed time per iteration (ms): 81029.2 | learning rate: 5.729E-05 | global batch size: 2048 | lm loss: 4.583487E+00 | loss scale: 16384.0 | grad norm: 14592.592 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1050/ 292968 | consumed samples: 2150400 | consumed tokens: 191512576 | elapsed time per iteration (ms): 82072.1 | learning rate: 5.734E-05 | global batch size: 2048 | lm loss: 4.596055E+00 | loss scale: 16384.0 | grad norm: 12900.253 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 1050 | lm loss value: 4.542943E+00 | lm loss PPL: 9.396694E+01 |
-------------------------------------------------------------------------------------------------
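
The logged perplexity is just the exponential of the validation lm loss; a one-line check in Python:

    import math

    # exp(4.542943) ~= 93.96694, i.e. the 9.396694E+01 PPL reported above.
    assert abs(math.exp(4.542943) - 9.396694e+01) < 1e-3
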
elapsed time per iteration (ms): 245795.0 | learning rate: 5.740E-05 | global batch size: 2048 | lm loss: 4.570917E+00 | loss scale: 16384.0 | grad norm: 10347.319 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1052/ 292968 | consumed samples: 2154496 | consumed tokens: 192004096 | elapsed time per iteration (ms): 88439.5 | learning rate: 5.745E-05 | global batch size: 2048 | lm loss: 4.576051E+00 | loss scale: 16384.0 | grad norm: 9439.837 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1053/ 292968 | consumed samples: 2156544 | consumed tokens: 192249856 | elapsed time per iteration (ms): 85478.1 | learning rate: 5.751E-05 | global batch size: 2048 | lm loss: 4.568721E+00 | loss scale: 16384.0 | grad norm: 11197.219 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1054/ 292968 | consumed samples: 2158592 | consumed tokens: 192495616 | elapsed time per iteration (ms): 84861.5 | learning rate: 5.756E-05 | global batch size: 2048 | lm loss: 4.559023E+00 | loss scale: 16384.0 | grad norm: 13635.982 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1055/ 292968 | consumed samples: 2160640 | consumed tokens: 192741376 | elapsed time per iteration (ms): 86520.1 | learning rate: 5.762E-05 | global batch size: 2048 | lm loss: 4.572903E+00 | loss scale: 16384.0 | grad norm: 14099.722 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1056/ 292968 | consumed samples: 2162688 | consumed tokens: 192987136 | elapsed time per iteration (ms): 84017.2 | learning rate: 5.767E-05 | global batch size: 2048 | lm loss: 4.569467E+00 | loss scale: 16384.0 | grad norm: 14507.478 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1057/ 292968 | consumed samples: 2164736 | consumed tokens: 193232896 | elapsed time per iteration (ms): 85371.1 | learning rate: 5.773E-05 | global batch size: 2048 | lm loss: 4.562497E+00 | loss scale: 16384.0 | grad norm: 17508.808 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1058/ 292968 | consumed samples: 2166784 | consumed tokens: 193478656 | elapsed time per iteration (ms): 85739.6 | learning rate: 5.778E-05 | global batch size: 2048 | lm loss: 4.548473E+00 | loss scale: 16384.0 | grad norm: 17052.188 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1059/ 292968 | consumed samples: 2168832 | consumed tokens: 193724416 | elapsed time per iteration (ms): 87460.7 | learning rate: 5.784E-05 | global batch size: 2048 | lm loss: 4.562971E+00 | loss scale: 16384.0 | grad norm: 14528.720 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1060/ 292968 | consumed samples: 2170880 | consumed tokens: 193970176 | elapsed time per iteration (ms): 83484.5 | learning rate: 5.789E-05 | global batch size: 2048 | lm loss: 4.552047E+00 | loss scale: 16384.0 | grad norm: 11330.575 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 1061/ 292968 | consumed samples: 2172928 | consumed tokens: 194215936 | elapsed time per iteration (ms): 84461.0 | learning rate: 5.794E-05 | global batch size: 2048 | lm loss: 4.533634E+00 | loss scale: 16384.0 | grad norm: 10384.092 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1062/ 292968 | consumed samples: 2174976 | consumed tokens: 194461696 | elapsed time per iteration (ms): 93327.4 | learning rate: 5.800E-05 | global batch size: 2048 | lm loss: 4.548277E+00 | loss scale: 16384.0 | grad norm: 12123.189 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1063/ 292968 | consumed samples: 2177024 | consumed tokens: 194707456 | elapsed time per iteration (ms): 98134.6 | learning rate: 5.805E-05 | global batch size: 2048 | lm loss: 4.528694E+00 | loss scale: 16384.0 | grad norm: 11437.922 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1064/ 292968 | consumed samples: 2179072 | consumed tokens: 194953216 | elapsed time per iteration (ms): 90310.0 | learning rate: 5.811E-05 | global batch size: 2048 | lm loss: 4.568820E+00 | loss scale: 16384.0 | grad norm: 11875.659 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1065/ 292968 | consumed samples: 2181120 | consumed tokens: 195198976 | elapsed time per iteration (ms): 85565.3 | learning rate: 5.816E-05 | global batch size: 2048 | lm loss: 4.540607E+00 | loss scale: 16384.0 | grad norm: 14195.778 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1066/ 292968 | consumed samples: 2183168 | consumed tokens: 195444736 | elapsed time per iteration (ms): 84229.7 | learning rate: 5.822E-05 | global batch size: 2048 | lm loss: 4.550477E+00 | loss scale: 16384.0 | grad norm: 13063.774 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1067/ 292968 | consumed samples: 2185216 | consumed tokens: 195690496 | elapsed time per iteration (ms): 88617.9 | learning rate: 5.827E-05 | global batch size: 2048 | lm loss: 4.511925E+00 | loss scale: 16384.0 | grad norm: 11224.284 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1068/ 292968 | consumed samples: 2187264 | consumed tokens: 195936256 | elapsed time per iteration (ms): 93448.4 | learning rate: 5.833E-05 | global batch size: 2048 | lm loss: 4.546186E+00 | loss scale: 16384.0 | grad norm: 11750.694 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1069/ 292968 | consumed samples: 2189312 | consumed tokens: 196182016 | elapsed time per iteration (ms): 98087.4 | learning rate: 5.838E-05 | global batch size: 2048 | lm loss: 4.518701E+00 | loss scale: 16384.0 | grad norm: 16861.897 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1070/ 292968 | consumed samples: 2191360 | consumed tokens: 196427776 | elapsed time per iteration (ms): 91590.9 | learning rate: 5.844E-05 | global batch size: 2048 | lm loss: 4.572676E+00 | loss scale: 16384.0 | grad 
- iteration 1071/ 292968 | consumed samples: 2193408 | consumed tokens: 196673536 | elapsed time per iteration (ms): 87927.1 | learning rate: 5.849E-05 | global batch size: 2048 | lm loss: 4.546629E+00 | loss scale: 16384.0 | grad norm: 12336.601 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1072/ 292968 | consumed samples: 2195456 | consumed tokens: 196919296 | elapsed time per iteration (ms): 86009.7 | learning rate: 5.855E-05 | global batch size: 2048 | lm loss: 4.520898E+00 | loss scale: 16384.0 | grad norm: 12374.893 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1073/ 292968 | consumed samples: 2197504 | consumed tokens: 197165056 | elapsed time per iteration (ms): 87968.1 | learning rate: 5.860E-05 | global batch size: 2048 | lm loss: 4.525277E+00 | loss scale: 16384.0 | grad norm: 15381.149 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1074/ 292968 | consumed samples: 2199552 | consumed tokens: 197410816 | elapsed time per iteration (ms): 87325.8 | learning rate: 5.865E-05 | global batch size: 2048 | lm loss: 4.524608E+00 | loss scale: 16384.0 | grad norm: 15100.133 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1075/ 292968 | consumed samples: 2201600 | consumed tokens: 197656576 | elapsed time per iteration (ms): 86180.9 | learning rate: 5.871E-05 | global batch size: 2048 | lm loss: 4.544209E+00 | loss scale: 16384.0 | grad norm: 14167.176 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1076/ 292968 | consumed samples: 2203648 | consumed tokens: 197902336 | elapsed time per iteration (ms): 89477.1 | learning rate: 5.876E-05 | global batch size: 2048 | lm loss: 4.547174E+00 | loss scale: 16384.0 | grad norm: 14396.420 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1077/ 292968 | consumed samples: 2205696 | consumed tokens: 198148096 | elapsed time per iteration (ms): 85412.1 | learning rate: 5.882E-05 | global batch size: 2048 | lm loss: 4.549967E+00 | loss scale: 16384.0 | grad norm: 12780.203 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1078/ 292968 | consumed samples: 2207744 | consumed tokens: 198393856 | elapsed time per iteration (ms): 88025.4 | learning rate: 5.887E-05 | global batch size: 2048 | lm loss: 4.521796E+00 | loss scale: 16384.0 | grad norm: 14271.293 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1079/ 292968 | consumed samples: 2209792 | consumed tokens: 198639616 | elapsed time per iteration (ms): 84885.8 | learning rate: 5.893E-05 | global batch size: 2048 | lm loss: 4.544618E+00 | loss scale: 16384.0 | grad norm: 19504.510 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1080/ 292968 | consumed samples: 2211840 | consumed tokens: 198885376 | elapsed time per iteration (ms): 88736.4 | learning rate: 5.898E-05 | global batch size: 2048 | lm loss: 4.543962E+00 | loss scale: 16384.0 | grad norm: 15527.210 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1081/ 292968 | consumed samples: 2213888 | consumed tokens: 199131136 | elapsed time per iteration (ms): 85335.7 | learning rate: 5.904E-05 | global batch size: 2048 | lm loss: 4.550346E+00 | loss scale: 16384.0 | grad norm: 12987.855 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1082/ 292968 | consumed samples: 2215936 | consumed tokens: 199376896 | elapsed time per iteration (ms): 85752.9 | learning rate: 5.909E-05 | global batch size: 2048 | lm loss: 4.522818E+00 | loss scale: 16384.0 | grad norm: 13036.010 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1083/ 292968 | consumed samples: 2217984 | consumed tokens: 199622656 | elapsed time per iteration (ms): 85016.4 | learning rate: 5.915E-05 | global batch size: 2048 | lm loss: 4.546008E+00 | loss scale: 16384.0 | grad norm: 15226.897 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1084/ 292968 | consumed samples: 2220032 | consumed tokens: 199868416 | elapsed time per iteration (ms): 84878.0 | learning rate: 5.920E-05 | global batch size: 2048 | lm loss: 4.554209E+00 | loss scale: 16384.0 | grad norm: 17054.349 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1085/ 292968 | consumed samples: 2222080 | consumed tokens: 200114176 | elapsed time per iteration (ms): 84560.2 | learning rate: 5.926E-05 | global batch size: 2048 | lm loss: 4.523514E+00 | loss scale: 16384.0 | grad norm: 13857.835 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1086/ 292968 | consumed samples: 2224128 | consumed tokens: 200359936 | elapsed time per iteration (ms): 87969.6 | learning rate: 5.931E-05 | global batch size: 2048 | lm loss: 4.505604E+00 | loss scale: 16384.0 | grad norm: 13880.828 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1087/ 292968 | consumed samples: 2226176 | consumed tokens: 200605696 | elapsed time per iteration (ms): 83970.0 | learning rate: 5.936E-05 | global batch size: 2048 | lm loss: 4.529661E+00 | loss scale: 16384.0 | grad norm: 14968.225 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1088/ 292968 | consumed samples: 2228224 | consumed tokens: 200851456 | elapsed time per iteration (ms): 86166.1 | learning rate: 5.942E-05 | global batch size: 2048 | lm loss: 4.514328E+00 | loss scale: 16384.0 | grad norm: 12953.939 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1089/ 292968 | consumed samples: 2230272 | consumed tokens: 201097216 | elapsed time per iteration (ms): 87126.8 | learning rate: 5.947E-05 | global batch size: 2048 | lm loss: 4.512712E+00 | loss scale: 16384.0 | grad norm: 10613.516 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1090/ 292968 | consumed samples: 2232320 | consumed tokens: 201342976 | elapsed time per iteration (ms): 95716.1 | learning rate: 5.953E-05 | global batch size: 2048 | lm loss: 4.535466E+00 | loss scale: 16384.0 | grad norm: 10655.507 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1091/ 292968 | consumed samples: 2234368 | consumed tokens: 201588736 | elapsed time per iteration (ms): 93459.4 | learning rate: 5.958E-05 | global batch size: 2048 | lm loss: 4.515153E+00 | loss scale: 16384.0 | grad norm: 15277.694 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1092/ 292968 | consumed samples: 2236416 | consumed tokens: 201834496 | elapsed time per iteration (ms): 86769.3 | learning rate: 5.964E-05 | global batch size: 2048 | lm loss: 4.519552E+00 | loss scale: 16384.0 | grad norm: 17853.079 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1093/ 292968 | consumed samples: 2238464 | consumed tokens: 202080256 | elapsed time per iteration (ms): 86759.7 | learning rate: 5.969E-05 | global batch size: 2048 | lm loss: 4.472535E+00 | loss scale: 16384.0 | grad norm: 13562.821 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1094/ 292968 | consumed samples: 2240512 | consumed tokens: 202326016 | elapsed time per iteration (ms): 86325.0 | learning rate: 5.975E-05 | global batch size: 2048 | lm loss: 4.504158E+00 | loss scale: 16384.0 | grad norm: 16543.817 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1095/ 292968 | consumed samples: 2242560 | consumed tokens: 202571776 | elapsed time per iteration (ms): 84784.0 | learning rate: 5.980E-05 | global batch size: 2048 | lm loss: 4.483425E+00 | loss scale: 16384.0 | grad norm: 14002.427 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1096/ 292968 | consumed samples: 2244608 | consumed tokens: 202817536 | elapsed time per iteration (ms): 87232.1 | learning rate: 5.986E-05 | global batch size: 2048 | lm loss: 4.528960E+00 | loss scale: 16384.0 | grad norm: 9012.126 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1097/ 292968 | consumed samples: 2246656 | consumed tokens: 203063296 | elapsed time per iteration (ms): 92666.2 | learning rate: 5.991E-05 | global batch size: 2048 | lm loss: 4.505988E+00 | loss scale: 16384.0 | grad norm: 9767.692 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1098/ 292968 | consumed samples: 2248704 | consumed tokens: 203309056 | elapsed time per iteration (ms): 86476.0 | learning rate: 5.997E-05 | global batch size: 2048 | lm loss: 4.498043E+00 | loss scale: 16384.0 | grad norm: 9326.083 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1099/ 292968 | consumed samples: 2250752 | consumed tokens: 203554816 | elapsed time per iteration (ms): 86535.7 | learning rate: 6.002E-05 | global batch size: 2048 | lm loss: 4.498255E+00 | loss scale: 16384.0 | grad norm: 7741.958 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1100/ 292968 | consumed samples: 2252800 | consumed tokens: 203800576 | elapsed time per iteration (ms): 88988.6 | learning rate: 6.007E-05 | global batch size: 2048 | lm loss: 4.514945E+00 | loss scale: 16384.0 | grad norm: 9861.857 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1101/ 292968 | consumed samples: 2254848 | consumed tokens: 204046336 | elapsed time per iteration (ms): 88406.5 | learning rate: 6.013E-05 | global batch size: 2048 | lm loss: 4.494561E+00 | loss scale: 16384.0 | grad norm: 10522.059 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1102/ 292968 | consumed samples: 2256896 | consumed tokens: 204292096 | elapsed time per iteration (ms): 86818.6 | learning rate: 6.018E-05 | global batch size: 2048 | lm loss: 4.514156E+00 | loss scale: 16384.0 | grad norm: 13752.024 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1103/ 292968 | consumed samples: 2258944 | consumed tokens: 204537856 | elapsed time per iteration (ms): 87424.2 | learning rate: 6.024E-05 | global batch size: 2048 | lm loss: 4.487889E+00 | loss scale: 16384.0 | grad norm: 18219.965 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1104/ 292968 | consumed samples: 2260992 | consumed tokens: 204783616 | elapsed time per iteration (ms): 83889.3 | learning rate: 6.029E-05 | global batch size: 2048 | lm loss: 4.505061E+00 | loss scale: 16384.0 | grad norm: 18146.389 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1105/ 292968 | consumed samples: 2263040 | consumed tokens: 205029376 | elapsed time per iteration (ms): 90055.6 | learning rate: 6.035E-05 | global batch size: 2048 | lm loss: 4.484328E+00 | loss scale: 16384.0 | grad norm: 13828.124 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1106/ 292968 | consumed samples: 2265088 | consumed tokens: 205275136 | elapsed time per iteration (ms): 92247.0 | learning rate: 6.040E-05 | global batch size: 2048 | lm loss: 4.481535E+00 | loss scale: 16384.0 | grad norm: 13118.742 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1107/ 292968 | consumed samples: 2267136 | consumed tokens: 205520896 | elapsed time per iteration (ms): 94208.8 | learning rate: 6.046E-05 | global batch size: 2048 | lm loss: 4.484448E+00 | loss scale: 16384.0 | grad norm: 11851.064 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1108/ 292968 | consumed samples: 2269184 | consumed tokens: 205766656 | elapsed time per iteration (ms): 91166.3 | learning rate: 6.051E-05 | global batch size: 2048 | lm loss: 4.499976E+00 | loss scale: 16384.0 | grad norm: 12946.673 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1109/ 292968 | consumed samples: 2271232 | consumed tokens: 206012416 | elapsed time per iteration (ms): 90733.1 | learning rate: 6.057E-05 | global batch size: 2048 | lm loss: 4.495043E+00 | loss scale: 16384.0 | grad norm: 14410.823 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1110/ 292968 | consumed samples: 2273280 | consumed tokens: 206258176 | elapsed time per iteration (ms): 89090.5 | learning rate: 6.062E-05 | global batch size: 2048 | lm loss: 4.502302E+00 | loss scale: 16384.0 | grad norm: 13941.163 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1111/ 292968 | consumed samples: 2275328 | consumed tokens: 206503936 | elapsed time per iteration (ms): 86854.9 | learning rate: 6.068E-05 | global batch size: 2048 | lm loss: 4.500597E+00 | loss scale: 16384.0 | grad norm: 12233.647 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1112/ 292968 | consumed samples: 2277376 | consumed tokens: 206749696 | elapsed time per iteration (ms): 85022.5 | learning rate: 6.073E-05 | global batch size: 2048 | lm loss: 4.517644E+00 | loss scale: 16384.0 | grad norm: 13233.556 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1113/ 292968 | consumed samples: 2279424 | consumed tokens: 206995456 | elapsed time per iteration (ms): 86262.7 | learning rate: 6.078E-05 | global batch size: 2048 | lm loss: 4.487082E+00 | loss scale: 16384.0 | grad norm: 12106.235 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1114/ 292968 | consumed samples: 2281472 | consumed tokens: 207241216 | elapsed time per iteration (ms): 84996.9 | learning rate: 6.084E-05 | global batch size: 2048 | lm loss: 4.503507E+00 | loss scale: 16384.0 | grad norm: 10487.955 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1115/ 292968 | consumed samples: 2283520 | consumed tokens: 207486976 | elapsed time per iteration (ms): 84269.7 | learning rate: 6.089E-05 | global batch size: 2048 | lm loss: 4.505131E+00 | loss scale: 16384.0 | grad norm: 14373.258 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1116/ 292968 | consumed samples: 2285568 | consumed tokens: 207732736 | elapsed time per iteration (ms): 88293.0 | learning rate: 6.095E-05 | global batch size: 2048 | lm loss: 4.518019E+00 | loss scale: 16384.0 | grad norm: 14407.661 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1117/ 292968 | consumed samples: 2287616 | consumed tokens: 207978496 | elapsed time per iteration (ms): 101083.3 | learning rate: 6.100E-05 | global batch size: 2048 | lm loss: 4.499104E+00 | loss scale: 16384.0 | grad norm: 13577.662 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1118/ 292968 | consumed samples: 2289664 | consumed tokens: 208224256 | elapsed time per iteration (ms): 101950.5 | learning rate: 6.106E-05 | global batch size: 2048 | lm loss: 4.470523E+00 | loss scale: 16384.0 | grad norm: 12582.243 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1119/ 292968 | consumed samples: 2291712 | consumed tokens: 208470016 | elapsed time per iteration (ms): 100545.3 | learning rate: 6.111E-05 | global batch size: 2048 | lm loss: 4.511635E+00 | loss scale: 16384.0 | grad norm: 12043.770 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1120/ 292968 | consumed samples: 2293760 | consumed tokens: 208715776 | elapsed time per iteration (ms): 98941.1 | learning rate: 6.117E-05 | global batch size: 2048 | lm loss: 4.480804E+00 | loss scale: 16384.0 | grad norm: 13261.132 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1121/ 292968 | consumed samples: 2295808 | consumed tokens: 208961536 | elapsed time per iteration (ms): 98500.9 | learning rate: 6.122E-05 | global batch size: 2048 | lm loss: 4.481951E+00 | loss scale: 16384.0 | grad norm: 12552.504 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1122/ 292968 | consumed samples: 2297856 | consumed tokens: 209207296 | elapsed time per iteration (ms): 100829.8 | learning rate: 6.128E-05 | global batch size: 2048 | lm loss: 4.469101E+00 | loss scale: 16384.0 | grad norm: 9809.397 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1123/ 292968 | consumed samples: 2299904 | consumed tokens: 209453056 | elapsed time per iteration (ms): 103389.9 | learning rate: 6.133E-05 | global batch size: 2048 | lm loss: 4.454999E+00 | loss scale: 16384.0 | grad norm: 10922.365 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1124/ 292968 | consumed samples: 2301952 | consumed tokens: 209698816 | elapsed time per iteration (ms): 93724.8 | learning rate: 6.139E-05 | global batch size: 2048 | lm loss: 4.505367E+00 | loss scale: 16384.0 | grad norm: 11856.912 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1125/ 292968 | consumed samples: 2304000 | consumed tokens: 209944576 | elapsed time per iteration (ms): 84843.7 | learning rate: 6.144E-05 | global batch size: 2048 | lm loss: 4.477328E+00 | loss scale: 16384.0 | grad norm: 12093.303 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1126/ 292968 | consumed samples: 2306048 | consumed tokens: 210190336 | elapsed time per iteration (ms): 87356.5 | learning rate: 6.149E-05 | global batch size: 2048 | lm loss: 4.476051E+00 | loss scale: 16384.0 | grad norm: 12555.557 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1127/ 292968 | consumed samples: 2308096 | consumed tokens: 210436096 | elapsed time per iteration (ms): 89973.6 | learning rate: 6.155E-05 | global batch size: 2048 | lm loss: 4.458952E+00 | loss scale: 16384.0 | grad norm: 10239.670 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1128/ 292968 | consumed samples: 2310144 | consumed tokens: 210681856 | elapsed time per iteration (ms): 90691.1 | learning rate: 6.160E-05 | global batch size: 2048 | lm loss: 4.504097E+00 | loss scale: 16384.0 | grad norm: 9880.113 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1129/ 292968 | consumed samples: 2312192 | consumed tokens: 210927616 | elapsed time per iteration (ms): 92646.0 | learning rate: 6.166E-05 | global batch size: 2048 | lm loss: 4.479900E+00 | loss scale: 16384.0 | grad norm: 11519.475 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1130/ 292968 | consumed samples: 2314240 | consumed tokens: 211173376 | elapsed time per iteration (ms): 88691.6 | learning rate: 6.171E-05 | global batch size: 2048 | lm loss: 4.446621E+00 | loss scale: 16384.0 | grad norm: 10702.181 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1131/ 292968 | consumed samples: 2316288 | consumed tokens: 211419136 | elapsed time per iteration (ms): 88771.9 | learning rate: 6.177E-05 | global batch size: 2048 | lm loss: 4.428393E+00 | loss scale: 16384.0 | grad norm: 11272.416 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1132/ 292968 | consumed samples: 2318336 | consumed tokens: 211664896 | elapsed time per iteration (ms): 90562.2 | learning rate: 6.182E-05 | global batch size: 2048 | lm loss: 4.474543E+00 | loss scale: 16384.0 | grad norm: 15468.855 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1133/ 292968 | consumed samples: 2320384 | consumed tokens: 211910656 | elapsed time per iteration (ms): 93483.0 | learning rate: 6.188E-05 | global batch size: 2048 | lm loss: 4.508697E+00 | loss scale: 16384.0 | grad norm: 18611.867 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1134/ 292968 | consumed samples: 2322432 | consumed tokens: 212156416 | elapsed time per iteration (ms): 83877.9 | learning rate: 6.193E-05 | global batch size: 2048 | lm loss: 4.506527E+00 | loss scale: 16384.0 | grad norm: 13665.538 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1135/ 292968 | consumed samples: 2324480 | consumed tokens: 212402176 | elapsed time per iteration (ms): 84242.1 | learning rate: 6.199E-05 | global batch size: 2048 | lm loss: 4.490401E+00 | loss scale: 16384.0 | grad norm: 16179.505 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1136/ 292968 | consumed samples: 2326528 | consumed tokens: 212647936 | elapsed time per iteration (ms): 82968.9 | learning rate: 6.204E-05 | global batch size: 2048 | lm loss: 4.472262E+00 | loss scale: 16384.0 | grad norm: 15997.198 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1137/ 292968 | consumed samples: 2328576 | consumed tokens: 212893696 | elapsed time per iteration (ms): 87964.9 | learning rate: 6.210E-05 | global batch size: 2048 | lm loss: 4.472732E+00 | loss scale: 16384.0 | grad norm: 12482.858 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1138/ 292968 | consumed samples: 2330624 | consumed tokens: 213139456 | elapsed time per iteration (ms): 87058.3 | learning rate: 6.215E-05 | global batch size: 2048 | lm loss: 4.475842E+00 | loss scale: 16384.0 | grad norm: 15157.091 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1139/ 292968 | consumed samples: 2332672 | consumed tokens: 213385216 | elapsed time per iteration (ms): 85216.4 | learning rate: 6.220E-05 | global batch size: 2048 | lm loss: 4.482242E+00 | loss scale: 16384.0 | grad norm: 16168.095 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1140/ 292968 | consumed samples: 2334720 | consumed tokens: 213630976 | elapsed time per iteration (ms): 84317.7 | learning rate: 6.226E-05 | global batch size: 2048 | lm loss: 4.454989E+00 | loss scale: 16384.0 | grad norm: 13895.017 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1141/ 292968 | consumed samples: 2336768 | consumed tokens: 213876736 | elapsed time per iteration (ms): 83268.7 | learning rate: 6.231E-05 | global batch size: 2048 | lm loss: 4.475907E+00 | loss scale: 16384.0 | grad norm: 13531.071 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1142/ 292968 | consumed samples: 2338816 | consumed tokens: 214122496 | elapsed time per iteration (ms): 84086.7 | learning rate: 6.237E-05 | global batch size: 2048 | lm loss: 4.450652E+00 | loss scale: 16384.0 | grad norm: 13514.029 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1143/ 292968 | consumed samples: 2340864 | consumed tokens: 214368256 | elapsed time per iteration (ms): 83901.4 | learning rate: 6.242E-05 | global batch size: 2048 | lm loss: 4.436163E+00 | loss scale: 16384.0 | grad norm: 13077.640 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1144/ 292968 | consumed samples: 2342912 | consumed tokens: 214614016 | elapsed time per iteration (ms): 92021.8 | learning rate: 6.248E-05 | global batch size: 2048 | lm loss: 4.420115E+00 | loss scale: 16384.0 | grad norm: 9967.862 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1145/ 292968 | consumed samples: 2344960 | consumed tokens: 214859776 | elapsed time per iteration (ms): 85183.5 | learning rate: 6.253E-05 | global batch size: 2048 | lm loss: 4.453631E+00 | loss scale: 16384.0 | grad norm: 9284.835 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1146/ 292968 | consumed samples: 2347008 | consumed tokens: 215105536 | elapsed time per iteration (ms): 80509.5 | learning rate: 6.259E-05 | global batch size: 2048 | lm loss: 4.448218E+00 | loss scale: 16384.0 | grad norm: 11240.608 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1147/ 292968 | consumed samples: 2349056 | consumed tokens: 215351296 | elapsed time per iteration (ms): 81944.8 | learning rate: 6.264E-05 | global batch size: 2048 | lm loss: 4.446771E+00 | loss scale: 16384.0 | grad norm: 13038.998 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1148/ 292968 | consumed samples: 2351104 | consumed tokens: 215597056 | elapsed time per iteration (ms): 80348.5 | learning rate: 6.270E-05 | global batch size: 2048 | lm loss: 4.452250E+00 | loss scale: 16384.0 | grad norm: 11499.513 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1149/ 292968 | consumed samples: 2353152 | consumed tokens: 215842816 | elapsed time per iteration (ms): 84665.5 | learning rate: 6.275E-05 | global batch size: 2048 | lm loss: 4.448427E+00 | loss scale: 16384.0 | grad norm: 11235.186 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1150/ 292968 | consumed samples: 2355200 | consumed tokens: 216088576 | elapsed time per iteration (ms): 87862.6 | learning rate: 6.281E-05 | global batch size: 2048 | lm loss: 4.470460E+00 | loss scale: 16384.0 | grad norm: 17633.464 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1151/ 292968 | consumed samples: 2357248 | consumed tokens: 216334336 | elapsed time per iteration (ms): 91071.6 | learning rate: 6.286E-05 | global batch size: 2048 | lm loss: 4.453492E+00 | loss scale: 16384.0 | grad norm: 21667.478 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1152/ 292968 | consumed samples: 2359296 | consumed tokens: 216580096 | elapsed time per iteration (ms): 87731.9 | learning rate: 6.291E-05 | global batch size: 2048 | lm loss: 4.454962E+00 | loss scale: 16384.0 | grad norm: 11102.499 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1153/ 292968 | consumed samples: 2361344 | consumed tokens: 216825856 | elapsed time per iteration (ms): 86175.2 | learning rate: 6.297E-05 | global batch size: 2048 | lm loss: 4.472691E+00 | loss scale: 16384.0 | grad norm: 16589.243 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1154/ 292968 | consumed samples: 2363392 | consumed tokens: 217071616 | elapsed time per iteration (ms): 86787.4 | learning rate: 6.302E-05 | global batch size: 2048 | lm loss: 4.440430E+00 | loss scale: 16384.0 | grad norm: 14527.168 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1155/ 292968 | consumed samples: 2365440 | consumed tokens: 217317376 | elapsed time per iteration (ms): 89509.0 | learning rate: 6.308E-05 | global batch size: 2048 | lm loss: 4.453074E+00 | loss scale: 16384.0 | grad norm: 11873.027 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1156/ 292968 | consumed samples: 2367488 | consumed tokens: 217563136 | elapsed time per iteration (ms): 88188.0 | learning rate: 6.313E-05 | global batch size: 2048 | lm loss: 4.437817E+00 | loss scale: 16384.0 | grad norm: 11356.202 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1157/ 292968 | consumed samples: 2369536 | consumed tokens: 217808896 | elapsed time per iteration (ms): 89816.8 | learning rate: 6.319E-05 | global batch size: 2048 | lm loss: 4.474587E+00 | loss scale: 16384.0 | grad norm: 13801.132 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1158/ 292968 | consumed samples: 2371584 | consumed tokens: 218054656 | elapsed time per iteration (ms): 85464.5 | learning rate: 6.324E-05 | global batch size: 2048 | lm loss: 4.457763E+00 | loss scale: 16384.0 | grad norm: 16588.132 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1159/ 292968 | consumed samples: 2373632 | consumed tokens: 218300416 | elapsed time per iteration (ms): 88186.8 | learning rate: 6.330E-05 | global batch size: 2048 | lm loss: 4.483557E+00 | loss scale: 16384.0 | grad norm: 14769.798 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1160/ 292968 | consumed samples: 2375680 | consumed tokens: 218546176 | elapsed time per iteration (ms): 82144.8 | learning rate: 6.335E-05 | global batch size: 2048 | lm loss: 4.449202E+00 | loss scale: 16384.0 | grad norm: 11017.962 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1161/ 292968 | consumed samples: 2377728 | consumed tokens: 218791936 | elapsed time per iteration (ms): 83289.6 | learning rate: 6.341E-05 | global batch size: 2048 | lm loss: 4.423344E+00 | loss scale: 16384.0 | grad norm: 11202.773 | num zeros: 0.0 | curriculum seqlen: 120 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1162/ 292968 | consumed samples: 2379776 | consumed tokens: 219054080 | elapsed time per iteration (ms): 83192.9 | learning rate: 6.346E-05 | global batch size: 2048 | lm loss: 4.544310E+00 | loss scale: 16384.0 | grad norm: 18433.308 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1163/ 292968 | consumed samples: 2381824 | consumed tokens: 219316224 | elapsed time per iteration (ms): 82279.6 | learning rate: 6.352E-05 | global batch size: 2048 | lm loss: 4.501222E+00 | loss scale: 16384.0 | grad norm: 17054.890 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1164/ 292968 | consumed samples: 2383872 | consumed tokens: 219578368 | elapsed time per iteration (ms): 81750.0 | learning rate: 6.357E-05 | global batch size: 2048 | lm loss: 4.517543E+00 | loss scale: 16384.0 | grad norm: 20929.495 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1165/ 292968 | consumed samples: 2385920 | consumed tokens: 219840512 | elapsed time per iteration (ms): 81752.4 | learning rate: 6.362E-05 | global batch size: 2048 | lm loss: 4.540401E+00 | loss scale: 16384.0 | grad norm: 13879.199 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1166/ 292968 | consumed samples: 2387968 | consumed tokens: 220102656 | elapsed time per iteration (ms): 82334.6 | learning rate: 6.368E-05 | global batch size: 2048 | lm loss: 4.525122E+00 | loss scale: 16384.0 | grad norm: 16822.318 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1167/ 292968 | consumed samples: 2390016 | consumed tokens: 220364800 | elapsed time per iteration (ms): 83774.4 | learning rate: 6.373E-05 | global batch size: 2048 | lm loss: 4.509167E+00 | loss scale: 16384.0 | grad norm: 17342.147 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1168/ 292968 | consumed samples: 2392064 | consumed tokens: 220626944 | elapsed time per iteration (ms): 83587.9 | learning rate: 6.379E-05 | global batch size: 2048 | lm loss: 4.517789E+00 | loss scale: 16384.0 | grad norm: 16292.729 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1169/ 292968 | consumed samples: 2394112 | consumed tokens: 220889088 | elapsed time per iteration (ms): 82380.4 | learning rate: 6.384E-05 | global batch size: 2048 | lm loss: 4.466714E+00 | loss scale: 16384.0 | grad norm: 12805.022 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1170/ 292968 | consumed samples: 2396160 | consumed tokens: 221151232 | elapsed time per iteration (ms): 85945.4 | learning rate: 6.390E-05 | global batch size: 2048 | lm loss: 4.475655E+00 | loss scale: 16384.0 | grad norm: 12161.540 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1171/ 292968 | consumed samples: 2398208 | consumed tokens: 221413376 | elapsed time per iteration (ms): 88588.2 | learning rate: 6.395E-05 | global batch size: 2048 | lm loss: 4.475016E+00 | loss scale: 16384.0 | grad norm: 11806.118 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1172/ 292968 | consumed samples: 2400256 | consumed tokens: 221675520 | elapsed time per iteration (ms): 95985.7 | learning rate: 6.401E-05 | global batch size: 2048 | lm loss: 4.467658E+00 | loss scale: 16384.0 | grad norm: 11612.126 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1173/ 292968 | consumed samples: 2402304 | consumed tokens: 221937664 | elapsed time per iteration (ms): 87312.2 | learning rate: 6.406E-05 | global batch size: 2048 | lm loss: 4.444437E+00 | loss scale: 16384.0 | grad norm: 8432.213 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1174/ 292968 | consumed samples: 2404352 | consumed tokens: 222199808 | elapsed time per iteration (ms): 85322.4 | learning rate: 6.412E-05 | global batch size: 2048 | lm loss: 4.444757E+00 | loss scale: 16384.0 | grad norm: 7541.112 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1175/ 292968 | consumed samples: 2406400 | consumed tokens: 222461952 | elapsed time per iteration (ms): 83411.9 | learning rate: 6.417E-05 | global batch size: 2048 | lm loss: 4.476314E+00 | loss scale: 16384.0 | grad norm: 8004.432 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1176/ 292968 | consumed samples: 2408448 | consumed tokens: 222724096 | elapsed time per iteration (ms): 82953.2 | learning rate: 6.423E-05 | global batch size: 2048 | lm loss: 4.446434E+00 | loss scale: 16384.0 | grad norm: 8909.883 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1177/ 292968 | consumed samples: 2410496 | consumed tokens: 222986240 | elapsed time per iteration (ms): 87868.2 | learning rate: 6.428E-05 | global batch size: 2048 | lm loss: 4.432823E+00 | loss scale: 16384.0 | grad norm: 8815.369 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1178/ 292968 | consumed samples: 2412544 | consumed tokens: 223248384 | elapsed time per iteration (ms): 92286.4 | learning rate: 6.433E-05 | global batch size: 2048 | lm loss: 4.440416E+00 | loss scale: 16384.0 | grad norm: 8249.604 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1179/ 292968 | consumed samples: 2414592 | consumed tokens: 223510528 | elapsed time per iteration (ms): 85012.5 | learning rate: 6.439E-05 | global batch size: 2048 | lm loss: 4.435045E+00 | loss scale: 16384.0 | grad norm: 13031.257 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1180/ 292968 | consumed samples: 2416640 | consumed tokens: 223772672 | elapsed time per iteration (ms): 84404.1 | learning rate: 6.444E-05 | global batch size: 2048 | lm loss: 4.449515E+00 | loss scale: 16384.0 | grad norm: 15463.512 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1181/ 292968 | consumed samples: 2418688 | consumed tokens: 224034816 | elapsed time per iteration (ms): 82794.2 | learning rate: 6.450E-05 | global batch size: 2048 | lm loss: 4.443280E+00 | loss scale: 16384.0 | grad norm: 12721.791 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1182/ 292968 | consumed samples: 2420736 | consumed tokens: 224296960 | elapsed time per iteration (ms): 80915.4 | learning rate: 6.455E-05 | global batch size: 2048 | lm loss: 4.428095E+00 | loss scale: 16384.0 | grad norm: 14710.674 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1183/ 292968 | consumed samples: 2422784 | consumed tokens: 224559104 | elapsed time per iteration (ms): 82279.8 | learning rate: 6.461E-05 | global batch size: 2048 | lm loss: 4.443545E+00 | loss scale: 16384.0 | grad norm: 12937.139 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1184/ 292968 | consumed samples: 2424832 | consumed tokens: 224821248 | elapsed time per iteration (ms): 81833.3 | learning rate: 6.466E-05 | global batch size: 2048 | lm loss: 4.385079E+00 | loss scale: 16384.0 | grad norm: 10797.823 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1185/ 292968 | consumed samples: 2426880 | consumed tokens: 225083392 | elapsed time per iteration (ms): 82539.1 | learning rate: 6.472E-05 | global batch size: 2048 | lm loss: 4.400814E+00 | loss scale: 16384.0 | grad norm: 12589.320 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1186/ 292968 | consumed samples: 2428928 | consumed tokens: 225345536 | elapsed time per iteration (ms): 81719.3 | learning rate: 6.477E-05 | global batch size: 2048 | lm loss: 4.399818E+00 | loss scale: 16384.0 | grad norm: 13407.551 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1187/ 292968 | consumed samples: 2430976 | consumed tokens: 225607680 | elapsed time per iteration (ms): 82001.5 | learning rate: 6.483E-05 | global batch size: 2048 | lm loss: 4.391018E+00 | loss scale: 16384.0 | grad norm: 14728.589 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1188/ 292968 | consumed samples: 2433024 | consumed tokens: 225869824 | elapsed time per iteration (ms): 82341.4 | learning rate: 6.488E-05 | global batch size: 2048 | lm loss: 4.435332E+00 | loss scale: 16384.0 | grad norm: 16077.369 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1189/ 292968 | consumed samples: 2435072 | consumed tokens: 226131968 | elapsed time per iteration (ms): 81553.5 | learning rate: 6.494E-05 | global batch size: 2048 | lm loss: 4.426288E+00 | loss scale: 16384.0 | grad norm: 15655.135 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1190/ 292968 | consumed samples: 2437120 | consumed tokens: 226394112 | elapsed time per iteration (ms): 80300.7 | learning rate: 6.499E-05 | global batch size: 2048 | lm loss: 4.436830E+00 | loss scale: 16384.0 | grad norm: 12006.628 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1191/ 292968 | consumed samples: 2439168 | consumed tokens: 226656256 | elapsed time per iteration (ms): 82008.2 | learning rate: 6.504E-05 | global batch size: 2048 | lm loss: 4.403228E+00 | loss scale: 16384.0 | grad norm: 9975.802 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1192/ 292968 | consumed samples: 2441216 | consumed tokens: 226918400 | elapsed time per iteration (ms): 81322.0 | learning rate: 6.510E-05 | global batch size: 2048 | lm loss: 4.408382E+00 | loss scale: 16384.0 | grad norm: 13007.168 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1193/ 292968 | consumed samples: 2443264 | consumed tokens: 227180544 | elapsed time per iteration (ms): 81446.6 | learning rate: 6.515E-05 | global batch size: 2048 | lm loss: 4.408079E+00 | loss scale: 16384.0 | grad norm: 14048.280 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1194/ 292968 | consumed samples: 2445312 | consumed tokens: 227442688 | elapsed time per iteration (ms): 81375.0 | learning rate: 6.521E-05 | global batch size: 2048 | lm loss: 4.423388E+00 | loss scale: 16384.0 | grad norm: 13199.497 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1195/ 292968 | consumed samples: 2447360 | consumed tokens: 227704832 | elapsed time per iteration (ms): 83516.5 | learning rate: 6.526E-05 | global batch size: 2048 | lm loss: 4.399532E+00 | loss scale: 16384.0 | grad norm: 13195.438 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1196/ 292968 | consumed samples: 2449408 | consumed tokens: 227966976 | elapsed time per iteration (ms): 81501.2 | learning rate: 6.532E-05 | global batch size: 2048 | lm loss: 4.389182E+00 | loss scale: 16384.0 | grad norm: 14781.582 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1197/ 292968 | consumed samples: 2451456 | consumed tokens: 228229120 | elapsed time per iteration (ms): 81359.1 | learning rate: 6.537E-05 | global batch size: 2048 | lm loss: 4.393048E+00 | loss scale: 16384.0 | grad norm: 11690.775 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1198/ 292968 | consumed samples: 2453504 | consumed tokens: 228491264 | elapsed time per iteration (ms): 82290.9 | learning rate: 6.543E-05 | global batch size: 2048 | lm loss: 4.375069E+00 | loss scale: 16384.0 | grad norm: 11151.430 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1199/ 292968 | consumed samples: 2455552 | consumed tokens: 228753408 | elapsed time per iteration (ms): 86665.6 | learning rate: 6.548E-05 | global batch size: 2048 | lm loss: 4.375456E+00 | loss scale: 16384.0 | grad norm: 10040.699 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1200/ 292968 | consumed samples: 2457600 | consumed tokens: 229015552 | elapsed time per iteration (ms): 85666.1 | learning rate: 6.554E-05 | global batch size: 2048 | lm loss: 4.438442E+00 | loss scale: 16384.0 | grad norm: 17645.529 | num zeros: 0.0 | curriculum seqlen: 128 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 1200 | lm loss value: 4.390853E+00 | lm loss PPL: 8.070923E+01 |
-------------------------------------------------------------------------------------------------
-saving checkpoint at iteration 1200 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-26 08:22:14,294] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/mp_rank_01_model_states.pt
-[2021-10-26 08:22:14,567] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/mp_rank_00_model_states.pt
-[2021-10-26 08:22:27,278] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_117_optim_states.pt
-[2021-10-26 08:22:27,333] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_82_optim_states.pt
-[2021-10-26 08:22:27,337] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_84_optim_states.pt
-[2021-10-26 08:22:27,356] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_77_optim_states.pt
-[2021-10-26 08:22:27,374] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_08_optim_states.pt
-[2021-10-26 08:22:27,409] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_12_optim_states.pt
-[2021-10-26 08:22:27,457] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_20_optim_states.pt
-[2021-10-26 08:22:27,467] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_05_optim_states.pt
-[2021-10-26 08:22:27,522] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_79_optim_states.pt
-[2021-10-26 08:22:27,576] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_115_optim_states.pt
-[2021-10-26 08:22:27,612] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_118_optim_states.pt
-[2021-10-26 08:22:27,626] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_107_optim_states.pt
-[2021-10-26 08:22:27,644] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_106_optim_states.pt
-[2021-10-26 08:22:27,645] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-26 08:22:27,649] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_06_optim_states.pt
-[2021-10-26 08:22:27,666] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_14_optim_states.pt
-[2021-10-26 08:22:27,682] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_121_optim_states.pt
-[2021-10-26 08:22:27,741] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_89_optim_states.pt
-[2021-10-26 08:22:27,742] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_10_optim_states.pt
-[2021-10-26 08:22:27,806] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_86_optim_states.pt
-[2021-10-26 08:22:27,812] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_90_optim_states.pt
-[2021-10-26 08:22:27,842] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_75_optim_states.pt
-[2021-10-26 08:22:27,881] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_81_optim_states.pt
-[2021-10-26 08:22:27,882] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_24_optim_states.pt
-[2021-10-26 08:22:27,899] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_25_optim_states.pt
-[2021-10-26 08:22:27,906] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_112_optim_states.pt
-[2021-10-26 08:22:27,926] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_120_optim_states.pt
-[2021-10-26 08:22:28,015] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_74_optim_states.pt
-[2021-10-26 08:22:28,097] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_62_optim_states.pt
-[2021-10-26 08:22:28,103] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_99_optim_states.pt
-[2021-10-26 08:22:28,223] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_96_optim_states.pt
-[2021-10-26 08:22:28,226] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_94_optim_states.pt
-[2021-10-26 08:22:28,312] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_109_optim_states.pt
-[2021-10-26 08:22:28,323] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_71_optim_states.pt
-[2021-10-26 08:22:28,345] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_52_optim_states.pt
-[2021-10-26 08:22:28,349] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_87_optim_states.pt
-[2021-10-26 08:22:28,392] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_57_optim_states.pt
-[2021-10-26 08:22:28,472] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_78_optim_states.pt
-[2021-10-26 08:22:28,507] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_101_optim_states.pt
-[2021-10-26 08:22:28,508] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_07_optim_states.pt
-[2021-10-26 08:22:28,561] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_119_optim_states.pt
-[2021-10-26 08:22:28,614] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_95_optim_states.pt
-[2021-10-26 08:22:28,645] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_80_optim_states.pt
-[2021-10-26 08:22:28,647] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_04_optim_states.pt
-[2021-10-26 08:22:28,664] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_114_optim_states.pt
-[2021-10-26 08:22:28,668] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_27_optim_states.pt
-[2021-10-26 08:22:28,672] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_61_optim_states.pt
-[2021-10-26 08:22:28,684] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_92_optim_states.pt
-[2021-10-26 08:22:28,704] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_11_optim_states.pt
-[2021-10-26 08:22:28,713] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_97_optim_states.pt
-[2021-10-26 08:22:28,723] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_76_optim_states.pt
-[2021-10-26 08:22:28,725] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_15_optim_states.pt
-[2021-10-26 08:22:28,743] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_123_optim_states.pt
-[2021-10-26 08:22:28,751] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_29_optim_states.pt
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-26 08:22:28,757] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-26 08:22:28,759] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-26 08:22:28,760] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-26 08:22:28,772] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-26 08:22:28,776] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-26 08:22:28,781] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-26 08:22:28,784] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-26 08:22:28,784] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-26 08:22:28,800] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_111_optim_states.pt -[2021-10-26 08:22:28,808] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-26 08:22:28,818] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-26 08:22:28,820] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-26 08:22:28,821] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-26 08:22:28,840] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-26 08:22:28,847] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-26 08:22:28,858] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-26 08:22:28,867] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-26 08:22:28,871] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-26 08:22:28,887] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-26 08:22:28,933] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-26 08:22:28,940] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_72_optim_states.pt -[2021-10-26 08:22:28,943] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-26 08:22:28,974] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-26 08:22:28,977] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-26 08:22:28,985] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-26 08:22:29,021] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-26 08:22:29,033] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-26 08:22:29,126] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-26 08:22:29,168] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-26 08:22:29,180] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-26 08:22:29,198] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-26 08:22:29,223] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-26 08:22:29,234] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-26 08:22:29,260] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-26 08:22:29,270] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-26 08:22:29,273] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-26 08:22:29,281] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-26 08:22:29,297] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-26 08:22:29,306] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-26 08:22:29,323] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-26 08:22:29,439] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-26 08:22:29,460] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-26 08:22:29,489] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-26 08:22:29,550] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-26 08:22:29,560] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-26 08:22:29,575] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-26 08:22:29,579] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-26 08:22:29,585] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-26 08:22:29,608] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-26 08:22:29,634] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-26 08:22:29,705] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-26 08:22:29,706] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-26 08:22:29,717] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-26 08:22:29,783] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-26 08:22:29,787] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-26 08:22:29,799] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-26 08:22:29,890] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-26 08:22:30,063] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-26 08:22:30,105] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-26 08:22:30,181] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_126_optim_states.pt -[2021-10-26 08:22:30,209] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-26 08:22:30,510] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-26 08:22:31,467] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1200/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-26 08:22:31,636] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
- successfully saved checkpoint at iteration 1200 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 25223.29
- iterations 1201-1338/ 292968, one row per iteration; constant on every row: global batch size: 2048 | loss scale: 16384.0 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | (each iteration record is followed by an empty "time (ms)" line)
- iteration | consumed samples | consumed tokens | elapsed time per iteration (ms) | learning rate | lm loss | grad norm | curriculum seqlen
- 1201 | 2459648 | 229277696 | 292166.7 | 6.559E-05 | 4.396831E+00 | 22874.480 | 128
- 1202 | 2461696 | 229539840 | 82609.1 | 6.565E-05 | 4.400014E+00 | 11497.666 | 128
- 1203 | 2463744 | 229801984 | 87960.1 | 6.570E-05 | 4.410713E+00 | 18418.275 | 128
- 1204 | 2465792 | 230064128 | 96182.2 | 6.575E-05 | 4.383131E+00 | 12132.603 | 128
- 1205 | 2467840 | 230326272 | 96399.2 | 6.581E-05 | 4.372148E+00 | 10641.085 | 128
- 1206 | 2469888 | 230588416 | 92673.1 | 6.586E-05 | 4.381207E+00 | 9952.677 | 128
- 1207 | 2471936 | 230850560 | 87223.9 | 6.592E-05 | 4.394568E+00 | 10581.024 | 128
- 1208 | 2473984 | 231112704 | 86492.0 | 6.597E-05 | 4.399185E+00 | 12296.673 | 128
- 1209 | 2476032 | 231374848 | 83425.7 | 6.603E-05 | 4.400098E+00 | 15118.336 | 128
- 1210 | 2478080 | 231636992 | 80058.6 | 6.608E-05 | 4.397508E+00 | 14987.338 | 128
- 1211 | 2480128 | 231899136 | 83039.9 | 6.614E-05 | 4.394114E+00 | 12769.539 | 128
- 1212 | 2482176 | 232161280 | 80939.7 | 6.619E-05 | 4.413101E+00 | 13665.086 | 128
- 1213 | 2484224 | 232423424 | 80147.6 | 6.625E-05 | 4.424644E+00 | 16107.055 | 128
- 1214 | 2486272 | 232685568 | 82732.3 | 6.630E-05 | 4.410405E+00 | 13795.403 | 128
- 1215 | 2488320 | 232947712 | 80741.7 | 6.636E-05 | 4.428461E+00 | 10623.169 | 128
- 1216 | 2490368 | 233209856 | 80900.1 | 6.641E-05 | 4.364405E+00 | 8287.667 | 128
- 1217 | 2492416 | 233472000 | 80076.2 | 6.646E-05 | 4.374700E+00 | 9858.643 | 128
- 1218 | 2494464 | 233734144 | 82874.8 | 6.652E-05 | 4.363934E+00 | 9868.903 | 128
- 1219 | 2496512 | 233996288 | 82405.9 | 6.657E-05 | 4.357006E+00 | 7732.803 | 128
- 1220 | 2498560 | 234258432 | 82837.0 | 6.663E-05 | 4.378250E+00 | 6599.288 | 128
- 1221 | 2500608 | 234520576 | 82640.8 | 6.668E-05 | 4.361965E+00 | 7286.991 | 128
- 1222 | 2502656 | 234782720 | 78584.9 | 6.674E-05 | 4.370571E+00 | 10202.523 | 128
- 1223 | 2504704 | 235044864 | 80506.0 | 6.679E-05 | 4.379625E+00 | 15200.100 | 128
- 1224 | 2506752 | 235307008 | 81030.9 | 6.685E-05 | 4.370178E+00 | 14628.530 | 128
- 1225 | 2508800 | 235569152 | 82454.2 | 6.690E-05 | 4.367004E+00 | 13334.750 | 128
- 1226 | 2510848 | 235831296 | 83265.9 | 6.696E-05 | 4.367511E+00 | 17692.388 | 128
- 1227 | 2512896 | 236093440 | 86018.3 | 6.701E-05 | 4.375439E+00 | 13705.937 | 128
- 1228 | 2514944 | 236355584 | 100371.9 | 6.707E-05 | 4.393482E+00 | 17453.825 | 128
- 1229 | 2516992 | 236617728 | 94718.4 | 6.712E-05 | 4.401050E+00 | 19996.157 | 128
- 1230 | 2519040 | 236879872 | 92427.2 | 6.717E-05 | 4.389038E+00 | 10066.272 | 128
- 1231 | 2521088 | 237142016 | 93164.2 | 6.723E-05 | 4.367707E+00 | 15716.487 | 128
- 1232 | 2523136 | 237404160 | 100583.8 | 6.728E-05 | 4.361382E+00 | 14122.687 | 128
- 1233 | 2525184 | 237666304 | 104590.4 | 6.734E-05 | 4.358156E+00 | 11569.865 | 128
- 1234 | 2527232 | 237928448 | 111699.2 | 6.739E-05 | 4.359481E+00 | 9884.000 | 128
- 1235 | 2529280 | 238190592 | 107678.6 | 6.745E-05 | 4.377883E+00 | 10385.919 | 128
- 1236 | 2531328 | 238452736 | 91354.2 | 6.750E-05 | 4.360860E+00 | 10722.396 | 128
- 1237 | 2533376 | 238714880 | 91033.7 | 6.756E-05 | 4.365729E+00 | 10556.339 | 128
- 1238 | 2535424 | 238977024 | 87076.6 | 6.761E-05 | 4.382186E+00 | 9626.425 | 128
- 1239 | 2537472 | 239239168 | 87669.8 | 6.767E-05 | 4.353411E+00 | 11362.074 | 128
- 1240 | 2539520 | 239501312 | 88595.5 | 6.772E-05 | 4.391058E+00 | 15878.575 | 128
- 1241 | 2541568 | 239763456 | 87222.8 | 6.778E-05 | 4.364265E+00 | 14391.869 | 128
- 1242 | 2543616 | 240025600 | 88478.0 | 6.783E-05 | 4.350900E+00 | 10875.616 | 128
- 1243 | 2545664 | 240287744 | 85038.6 | 6.788E-05 | 4.377454E+00 | 10760.449 | 128
- 1244 | 2547712 | 240549888 | 87602.7 | 6.794E-05 | 4.352528E+00 | 13228.411 | 128
- 1245 | 2549760 | 240812032 | 92165.3 | 6.799E-05 | 4.359108E+00 | 13012.896 | 128
- 1246 | 2551808 | 241074176 | 92810.0 | 6.805E-05 | 4.369980E+00 | 10911.241 | 128
- 1247 | 2553856 | 241336320 | 89252.2 | 6.810E-05 | 4.343827E+00 | 11826.940 | 128
- 1248 | 2555904 | 241598464 | 93185.0 | 6.816E-05 | 4.353201E+00 | 10993.662 | 128
- 1249 | 2557952 | 241860608 | 88268.1 | 6.821E-05 | 4.311706E+00 | 9639.579 | 128
- 1250 | 2560000 | 242122752 | 91225.9 | 6.827E-05 | 4.368147E+00 | 11423.006 | 128
- 1251 | 2562048 | 242384896 | 99360.7 | 6.832E-05 | 4.358833E+00 | 11737.250 | 128
- 1252 | 2564096 | 242647040 | 92521.8 | 6.838E-05 | 4.376199E+00 | 10385.547 | 128
- 1253 | 2566144 | 242909184 | 100330.4 | 6.843E-05 | 4.350906E+00 | 8867.763 | 128
- 1254 | 2568192 | 243171328 | 94271.3 | 6.849E-05 | 4.346105E+00 | 9138.100 | 128
- 1255 | 2570240 | 243433472 | 89876.1 | 6.854E-05 | 4.359590E+00 | 10671.736 | 128
- 1256 | 2572288 | 243695616 | 91479.6 | 6.859E-05 | 4.337495E+00 | 9382.482 | 128
- 1257 | 2574336 | 243957760 | 89077.2 | 6.865E-05 | 4.360833E+00 | 10931.909 | 128
- 1258 | 2576384 | 244219904 | 89543.6 | 6.870E-05 | 4.355038E+00 | 12315.148 | 128
- 1259 | 2578432 | 244482048 | 86626.2 | 6.876E-05 | 4.332624E+00 | 9028.785 | 128
- 1260 | 2580480 | 244744192 | 88403.0 | 6.881E-05 | 4.353878E+00 | 8587.953 | 128
- 1261 | 2582528 | 245006336 | 90653.6 | 6.887E-05 | 4.406543E+00 | 8519.735 | 128
- 1262 | 2584576 | 245268480 | 101721.7 | 6.892E-05 | 4.337947E+00 | 10856.149 | 128
- 1263 | 2586624 | 245530624 | 98966.3 | 6.898E-05 | 4.345151E+00 | 12642.575 | 128
- 1264 | 2588672 | 245792768 | 104276.2 | 6.903E-05 | 4.373935E+00 | 13739.412 | 128
- 1265 | 2590720 | 246054912 | 106458.8 | 6.909E-05 | 4.336057E+00 | 13718.934 | 128
- 1266 | 2592768 | 246317056 | 109558.3 | 6.914E-05 | 4.348790E+00 | 15140.293 | 128
- 1267 | 2594816 | 246579200 | 101169.1 | 6.920E-05 | 4.336976E+00 | 18580.935 | 128
- 1268 | 2596864 | 246841344 | 103186.3 | 6.925E-05 | 4.351308E+00 | 9034.022 | 128
- 1269 | 2598912 | 247103488 | 103322.1 | 6.930E-05 | 4.338009E+00 | 10030.218 | 128
- 1270 | 2600960 | 247365632 | 104430.5 | 6.936E-05 | 4.323060E+00 | 10375.946 | 128
- 1271 | 2603008 | 247627776 | 101797.9 | 6.941E-05 | 4.337749E+00 | 8465.022 | 128
- 1272 | 2605056 | 247889920 | 105815.4 | 6.947E-05 | 4.322408E+00 | 8592.805 | 128
- 1273 | 2607104 | 248152064 | 108179.9 | 6.952E-05 | 4.321740E+00 | 10722.339 | 128
- 1274 | 2609152 | 248414208 | 110063.2 | 6.958E-05 | 4.321163E+00 | 12199.826 | 128
- 1275 | 2611200 | 248676352 | 112486.2 | 6.963E-05 | 4.359476E+00 | 13015.753 | 128
- 1276 | 2613248 | 248938496 | 119132.6 | 6.969E-05 | 4.368865E+00 | 12810.900 | 128
- 1277 | 2615296 | 249200640 | 124483.3 | 6.974E-05 | 4.319435E+00 | 11086.670 | 128
- 1278 | 2617344 | 249462784 | 131501.7 | 6.980E-05 | 4.343135E+00 | 10249.176 | 128
- 1279 | 2619392 | 249724928 | 122263.3 | 6.985E-05 | 4.333991E+00 | 8418.978 | 128
- 1280 | 2621440 | 249987072 | 125027.7 | 6.991E-05 | 4.344658E+00 | 9345.066 | 128
- 1281 | 2623488 | 250249216 | 119818.3 | 6.996E-05 | 4.340658E+00 | 11343.930 | 128
- 1282 | 2625536 | 250511360 | 107960.9 | 7.001E-05 | 4.367644E+00 | 11059.651 | 128
- 1283 | 2627584 | 250773504 | 103476.2 | 7.007E-05 | 4.343670E+00 | 9443.485 | 128
- 1284 | 2629632 | 251035648 | 113204.7 | 7.012E-05 | 4.341036E+00 | 10326.934 | 128
- 1285 | 2631680 | 251297792 | 101453.0 | 7.018E-05 | 4.335133E+00 | 13935.373 | 128
- 1286 | 2633728 | 251559936 | 101126.4 | 7.023E-05 | 4.328067E+00 | 13261.563 | 128
- 1287 | 2635776 | 251822080 | 101433.7 | 7.029E-05 | 4.332537E+00 | 10151.353 | 128
- 1288 | 2637824 | 252084224 | 97179.0 | 7.034E-05 | 4.328178E+00 | 12186.076 | 128
- 1289 | 2639872 | 252346368 | 97410.4 | 7.040E-05 | 4.303625E+00 | 15999.316 | 128
- 1290 | 2641920 | 252608512 | 97712.4 | 7.045E-05 | 4.325552E+00 | 17938.209 | 128
- 1291 | 2643968 | 252870656 | 97348.4 | 7.051E-05 | 4.313485E+00 | 11220.149 | 128
- 1292 | 2646016 | 253132800 | 97091.0 | 7.056E-05 | 4.339503E+00 | 15690.936 | 128
- 1293 | 2648064 | 253394944 | 96068.1 | 7.062E-05 | 4.308480E+00 | 15248.013 | 128
- 1294 | 2650112 | 253657088 | 101209.6 | 7.067E-05 | 4.299973E+00 | 10467.217 | 128
- 1295 | 2652160 | 253919232 | 106905.6 | 7.072E-05 | 4.325128E+00 | 10645.088 | 128
- 1296 | 2654208 | 254181376 | 104630.7 | 7.078E-05 | 4.317550E+00 | 10104.458 | 128
- 1297 | 2656256 | 254443520 | 108402.3 | 7.083E-05 | 4.301074E+00 | 10153.653 | 128
- 1298 | 2658304 | 254705664 | 101393.9 | 7.089E-05 | 4.313783E+00 | 11186.819 | 128
- 1299 | 2660352 | 254967808 | 97468.1 | 7.094E-05 | 4.331973E+00 | 10929.262 | 128
- 1300 | 2662400 | 255229952 | 103670.2 | 7.100E-05 | 4.320304E+00 | 9919.120 | 128
- 1301 | 2664448 | 255492096 | 103703.3 | 7.105E-05 | 4.336925E+00 | 10814.834 | 128
- 1302 | 2666496 | 255754240 | 96139.5 | 7.111E-05 | 4.318452E+00 | 11068.371 | 128
- 1303 | 2668544 | 256016384 | 92160.2 | 7.116E-05 | 4.331538E+00 | 10972.349 | 128
- 1304 | 2670592 | 256278528 | 87573.4 | 7.122E-05 | 4.307694E+00 | 13438.511 | 128
- 1305 | 2672640 | 256540672 | 86671.4 | 7.127E-05 | 4.338923E+00 | 19454.195 | 128
- 1306 | 2674688 | 256802816 | 87566.0 | 7.133E-05 | 4.320871E+00 | 13488.959 | 128
- 1307 | 2676736 | 257081344 | 102038.5 | 7.138E-05 | 4.413541E+00 | 18168.800 | 136
- 1308 | 2678784 | 257359872 | 109015.4 | 7.143E-05 | 4.372187E+00 | 10812.401 | 136
- 1309 | 2680832 | 257638400 | 106725.5 | 7.149E-05 | 4.395649E+00 | 13451.504 | 136
- 1310 | 2682880 | 257916928 | 109015.2 | 7.154E-05 | 4.441962E+00 | 19299.987 | 136
- 1311 | 2684928 | 258195456 | 104596.5 | 7.160E-05 | 4.378983E+00 | 11561.969 | 136
- 1312 | 2686976 | 258473984 | 103802.3 | 7.165E-05 | 4.374365E+00 | 13670.889 | 136
- 1313 | 2689024 | 258752512 | 103736.3 | 7.171E-05 | 4.348674E+00 | 10213.036 | 136
- 1314 | 2691072 | 259031040 | 103663.9 | 7.176E-05 | 4.331293E+00 | 13151.653 | 136
- 1315 | 2693120 | 259309568 | 103760.9 | 7.182E-05 | 4.315998E+00 | 14473.062 | 136
- 1316 | 2695168 | 259588096 | 104084.0 | 7.187E-05 | 4.349117E+00 | 11313.236 | 136
- 1317 | 2697216 | 259866624 | 105133.0 | 7.193E-05 | 4.324214E+00 | 15165.408 | 136
- 1318 | 2699264 | 260145152 | 103961.9 | 7.198E-05 | 4.297659E+00 | 13970.172 | 136
- 1319 | 2701312 | 260423680 | 103869.3 | 7.203E-05 | 4.315687E+00 | 12823.779 | 136
- 1320 | 2703360 | 260702208 | 105499.5 | 7.209E-05 | 4.339356E+00 | 12505.072 | 136
- 1321 | 2705408 | 260980736 | 106715.5 | 7.214E-05 | 4.322292E+00 | 7680.711 | 136
- 1322 | 2707456 | 261259264 | 104743.5 | 7.220E-05 | 4.303059E+00 | 11274.482 | 136
- 1323 | 2709504 | 261537792 | 108461.6 | 7.225E-05 | 4.283995E+00 | 11434.034 | 136
- 1324 | 2711552 | 261816320 | 113653.2 | 7.231E-05 | 4.292516E+00 | 9910.438 | 136
- 1325 | 2713600 | 262094848 | 113595.4 | 7.236E-05 | 4.305782E+00 | 9792.060 | 136
- 1326 | 2715648 | 262373376 | 106966.1 | 7.242E-05 | 4.298875E+00 | 9256.978 | 136
- 1327 | 2717696 | 262651904 | 112772.2 | 7.247E-05 | 4.275658E+00 | 12353.776 | 136
- 1328 | 2719744 | 262930432 | 116094.4 | 7.253E-05 | 4.294221E+00 | 15819.284 | 136
- 1329 | 2721792 | 263208960 | 108861.8 | 7.258E-05 | 4.278796E+00 | 14416.408 | 136
- 1330 | 2723840 | 263487488 | 111717.3 | 7.264E-05 | 4.279788E+00 | 10858.691 | 136
- 1331 | 2725888 | 263766016 | 106840.2 | 7.269E-05 | 4.321123E+00 | 16413.887 | 136
- 1332 | 2727936 | 264044544 | 105046.3 | 7.274E-05 | 4.286259E+00 | 13602.333 | 136
- 1333 | 2729984 | 264323072 | 103539.0 | 7.280E-05 | 4.311579E+00 | 12268.700 | 136
- 1334 | 2732032 | 264601600 | 104597.9 | 7.285E-05 | 4.297973E+00 | 11817.463 | 136
- 1335 | 2734080 | 264880128 | 106853.2 | 7.291E-05 | 4.288142E+00 | 9158.477 | 136
- 1336 | 2736128 | 265158656 | 109768.8 | 7.296E-05 | 4.275808E+00 | 9550.713 | 136
- 1337 | 2738176 | 265437184 | 106402.4 | 7.302E-05 | 4.278894E+00 | 8149.629 | 136
- 1338 | 2740224 | 265715712 | 104883.4 | 7.307E-05 | 4.285826E+00 | 8283.185 | 136
- iteration 1339/ 292968 | consumed samples: 2742272 | consumed tokens: 265994240 | elapsed time per iteration (ms): 105272.5 | learning rate: 7.313E-05 | global batch size: 2048 | lm loss: 4.284776E+00 | loss scale: 16384.0 | grad norm: 8637.702 | num zeros: 0.0 | curriculum seqlen: 136 | number of
skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1340/ 292968 | consumed samples: 2744320 | consumed tokens: 266272768 | elapsed time per iteration (ms): 102678.5 | learning rate: 7.318E-05 | global batch size: 2048 | lm loss: 4.302094E+00 | loss scale: 16384.0 | grad norm: 8230.286 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1341/ 292968 | consumed samples: 2746368 | consumed tokens: 266551296 | elapsed time per iteration (ms): 103750.2 | learning rate: 7.324E-05 | global batch size: 2048 | lm loss: 4.306873E+00 | loss scale: 16384.0 | grad norm: 12167.833 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1342/ 292968 | consumed samples: 2748416 | consumed tokens: 266829824 | elapsed time per iteration (ms): 104922.5 | learning rate: 7.329E-05 | global batch size: 2048 | lm loss: 4.294527E+00 | loss scale: 16384.0 | grad norm: 11905.773 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1343/ 292968 | consumed samples: 2750464 | consumed tokens: 267108352 | elapsed time per iteration (ms): 103900.0 | learning rate: 7.335E-05 | global batch size: 2048 | lm loss: 4.295758E+00 | loss scale: 16384.0 | grad norm: 12966.247 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1344/ 292968 | consumed samples: 2752512 | consumed tokens: 267386880 | elapsed time per iteration (ms): 112773.0 | learning rate: 7.340E-05 | global batch size: 2048 | lm loss: 4.293741E+00 | loss scale: 16384.0 | grad norm: 17679.849 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1345/ 292968 | consumed samples: 2754560 | consumed tokens: 267665408 | elapsed time per iteration (ms): 107333.9 | learning rate: 7.345E-05 | global batch size: 2048 | lm loss: 4.285107E+00 | loss scale: 16384.0 | grad norm: 12319.450 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1346/ 292968 | consumed samples: 2756608 | consumed tokens: 267943936 | elapsed time per iteration (ms): 107084.2 | learning rate: 7.351E-05 | global batch size: 2048 | lm loss: 4.317650E+00 | loss scale: 16384.0 | grad norm: 10941.971 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1347/ 292968 | consumed samples: 2758656 | consumed tokens: 268222464 | elapsed time per iteration (ms): 104355.1 | learning rate: 7.356E-05 | global batch size: 2048 | lm loss: 4.266949E+00 | loss scale: 16384.0 | grad norm: 8940.800 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1348/ 292968 | consumed samples: 2760704 | consumed tokens: 268500992 | elapsed time per iteration (ms): 102429.5 | learning rate: 7.362E-05 | global batch size: 2048 | lm loss: 4.283114E+00 | loss scale: 16384.0 | grad norm: 7895.135 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1349/ 292968 | consumed samples: 2762752 | consumed tokens: 268779520 | elapsed time per iteration (ms): 105154.4 | learning rate: 7.367E-05 | global batch size: 2048 | lm loss: 
4.285004E+00 | loss scale: 16384.0 | grad norm: 9430.716 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1350/ 292968 | consumed samples: 2764800 | consumed tokens: 269058048 | elapsed time per iteration (ms): 103674.9 | learning rate: 7.373E-05 | global batch size: 2048 | lm loss: 4.279161E+00 | loss scale: 16384.0 | grad norm: 10926.594 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) ------------------------------------------------------------------------------------------------- - validation loss at iteration 1350 | lm loss value: 4.259500E+00 | lm loss PPL: 7.077459E+01 | ------------------------------------------------------------------------------------------------- - iteration 1351/ 292968 | consumed samples: 2766848 | consumed tokens: 269336576 | elapsed time per iteration (ms): 274611.4 | learning rate: 7.378E-05 | global batch size: 2048 | lm loss: 4.258837E+00 | loss scale: 16384.0 | grad norm: 10373.234 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1352/ 292968 | consumed samples: 2768896 | consumed tokens: 269615104 | elapsed time per iteration (ms): 106646.8 | learning rate: 7.384E-05 | global batch size: 2048 | lm loss: 4.268482E+00 | loss scale: 16384.0 | grad norm: 9422.137 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1353/ 292968 | consumed samples: 2770944 | consumed tokens: 269893632 | elapsed time per iteration (ms): 109903.2 | learning rate: 7.389E-05 | global batch size: 2048 | lm loss: 4.249788E+00 | loss scale: 16384.0 | grad norm: 9869.253 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1354/ 292968 | consumed samples: 2772992 | consumed tokens: 270172160 | elapsed time per iteration (ms): 104478.9 | learning rate: 7.395E-05 | global batch size: 2048 | lm loss: 4.269929E+00 | loss scale: 16384.0 | grad norm: 14670.245 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1355/ 292968 | consumed samples: 2775040 | consumed tokens: 270450688 | elapsed time per iteration (ms): 104033.5 | learning rate: 7.400E-05 | global batch size: 2048 | lm loss: 4.291121E+00 | loss scale: 16384.0 | grad norm: 17109.005 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1356/ 292968 | consumed samples: 2777088 | consumed tokens: 270729216 | elapsed time per iteration (ms): 103055.2 | learning rate: 7.406E-05 | global batch size: 2048 | lm loss: 4.270620E+00 | loss scale: 16384.0 | grad norm: 11280.739 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1357/ 292968 | consumed samples: 2779136 | consumed tokens: 271007744 | elapsed time per iteration (ms): 102621.3 | learning rate: 7.411E-05 | global batch size: 2048 | lm loss: 4.277614E+00 | loss scale: 16384.0 | grad norm: 9553.789 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1358/ 292968 | consumed samples: 2781184 | consumed tokens: 271286272 | elapsed time per iteration (ms): 103434.3 | learning rate: 7.416E-05 
| global batch size: 2048 | lm loss: 4.257460E+00 | loss scale: 16384.0 | grad norm: 12285.977 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1359/ 292968 | consumed samples: 2783232 | consumed tokens: 271564800 | elapsed time per iteration (ms): 104099.8 | learning rate: 7.422E-05 | global batch size: 2048 | lm loss: 4.267920E+00 | loss scale: 16384.0 | grad norm: 11875.146 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1360/ 292968 | consumed samples: 2785280 | consumed tokens: 271843328 | elapsed time per iteration (ms): 101938.2 | learning rate: 7.427E-05 | global batch size: 2048 | lm loss: 4.280769E+00 | loss scale: 16384.0 | grad norm: 12682.034 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1361/ 292968 | consumed samples: 2787328 | consumed tokens: 272121856 | elapsed time per iteration (ms): 103074.6 | learning rate: 7.433E-05 | global batch size: 2048 | lm loss: 4.259530E+00 | loss scale: 16384.0 | grad norm: 11334.140 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) -saving checkpoint at iteration 1361 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-26 12:51:31,633] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/mp_rank_00_model_states.pt -[2021-10-26 12:51:31,781] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/mp_rank_01_model_states.pt -[2021-10-26 12:51:44,629] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-26 12:51:44,723] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-26 12:51:44,798] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-26 12:51:44,829] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-26 12:51:44,839] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-26 12:51:44,844] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-26 12:51:44,887] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_06_optim_states.pt -[2021-10-26 12:51:44,894] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_84_optim_states.pt 
-[2021-10-26 12:51:44,904] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_27_optim_states.pt -[2021-10-26 12:51:44,918] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-26 12:51:44,940] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-26 12:51:44,949] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-26 12:51:44,953] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-26 12:51:44,981] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-26 12:51:44,985] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-26 12:51:44,988] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-26 12:51:45,002] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-26 12:51:45,003] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_107_optim_states.pt -[2021-10-26 12:51:45,005] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_87_optim_states.pt -[2021-10-26 12:51:45,010] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_22_optim_states.pt -[2021-10-26 12:51:45,110] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-26 12:51:45,118] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_11_optim_states.pt -[2021-10-26 12:51:45,187] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-26 12:51:45,204] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_115_optim_states.pt -[2021-10-26 12:51:45,237] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-26 12:51:45,243] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-26 12:51:45,291] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_120_optim_states.pt -[2021-10-26 12:51:45,361] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_97_optim_states.pt -[2021-10-26 12:51:45,379] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-26 12:51:45,458] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-26 12:51:45,532] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_23_optim_states.pt -[2021-10-26 12:51:45,754] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-26 12:51:45,816] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-26 12:51:45,826] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-26 12:51:45,834] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_24_optim_states.pt -[2021-10-26 12:51:45,853] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-26 12:51:45,858] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_14_optim_states.pt -[2021-10-26 12:51:45,860] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-26 12:51:45,861] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-26 12:51:45,875] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_10_optim_states.pt -[2021-10-26 12:51:45,886] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-26 12:51:45,894] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-26 12:51:45,896] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_79_optim_states.pt -[2021-10-26 12:51:45,920] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-26 12:51:45,926] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-26 12:51:45,933] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-26 12:51:45,949] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-26 12:51:45,960] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_106_optim_states.pt -[2021-10-26 12:51:45,998] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-26 12:51:46,017] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_82_optim_states.pt -[2021-10-26 12:51:46,021] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-26 12:51:46,035] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-26 12:51:46,046] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-26 12:51:46,060] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_118_optim_states.pt -[2021-10-26 12:51:46,072] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-26 12:51:46,084] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-26 12:51:46,097] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-26 12:51:46,101] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-26 12:51:46,120] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-26 12:51:46,122] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-26 12:51:46,127] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-26 12:51:46,155] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_07_optim_states.pt -[2021-10-26 12:51:46,179] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-26 12:51:46,182] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-26 12:51:46,200] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_05_optim_states.pt -[2021-10-26 12:51:46,201] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-26 12:51:46,204] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_111_optim_states.pt -[2021-10-26 12:51:46,247] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-26 12:51:46,251] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-26 12:51:46,268] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-26 12:51:46,279] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-26 12:51:46,290] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-26 12:51:46,324] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-26 12:51:46,325] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-26 12:51:46,327] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_08_optim_states.pt -[2021-10-26 12:51:46,371] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_72_optim_states.pt -[2021-10-26 12:51:46,380] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-26 12:51:46,421] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-26 12:51:46,446] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-26 12:51:46,454] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-26 12:51:46,496] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-26 12:51:46,521] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-26 12:51:46,575] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-26 12:51:46,577] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-26 12:51:46,616] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-26 12:51:46,623] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-26 12:51:46,644] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-26 12:51:46,694] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-26 12:51:46,706] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-26 12:51:46,739] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-26 12:51:46,749] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-26 12:51:46,756] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-26 12:51:46,763] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-26 12:51:46,775] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-26 12:51:46,799] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-26 12:51:46,802] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-26 12:51:46,845] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-26 12:51:46,866] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-26 12:51:46,888] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-26 12:51:46,908] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-26 12:51:46,935] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-26 12:51:46,952] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-26 12:51:47,002] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-26 12:51:47,024] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-26 12:51:47,176] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-26 12:51:47,203] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-26 12:51:47,210] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-26 12:51:47,221] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-26 12:51:47,221] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-26 12:51:47,292] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-26 12:51:47,329] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-26 12:51:47,390] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_00_optim_states.pt -[2021-10-26 12:51:47,481] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-26 12:51:47,627] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-26 12:51:47,658] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-26 12:51:47,794] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-26 12:51:48,908] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-26 12:51:48,920] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-26 12:51:48,960] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-26 12:51:48,970] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-26 12:51:49,184] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_126_optim_states.pt
-[2021-10-26 12:51:50,375] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_21_optim_states.pt
-[2021-10-26 12:51:50,758] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_20_optim_states.pt
-[2021-10-26 12:51:52,864] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_19_optim_states.pt
-[2021-10-26 12:51:53,073] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_18_optim_states.pt
-[2021-10-26 12:51:53,099] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_28_optim_states.pt
-[2021-10-26 12:51:53,415] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_17_optim_states.pt
-[2021-10-26 12:51:53,950] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1361/zero_pp_rank_0_mp_rank_29_optim_states.pt
- successfully saved checkpoint at iteration 1361 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 25533.31
-[exiting program after 1191.2136971910795 minutes] datetime: 2021-10-26 12:51:54
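The "zero checkpoint saved .../zero_pp_rank_*_mp_rank_*_optim_states.pt" lines above show each rank writing its own ZeRO-partitioned optimizer-state shard under the global_step1361 tag; the full save took about 25.5 s (save-checkpoint: 25533.31 ms). Below is a minimal sketch of how such a checkpoint is written and resumed through DeepSpeed's public save_checkpoint/load_checkpoint API, assuming a release where deepspeed.initialize accepts a config dict; the toy model, config values, and paths are illustrative, not the actual tr8b-104B training setup.

# Sketch only (illustrative model/config/paths, not the real training script).
import torch
import deepspeed

model = torch.nn.Linear(8, 8)  # stand-in for the actual 104B-parameter model

ds_config = {
    "train_batch_size": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    # ZeRO partitioning is what produces one *_optim_states.pt shard per rank.
    "zero_optimization": {"stage": 1},
}

engine, _, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config
)

# Every rank calls save_checkpoint; each writes its own shard under
# <save_dir>/<tag>/, e.g. zero_pp_rank_0_mp_rank_17_optim_states.pt above.
engine.save_checkpoint("checkpoints", tag="global_step1361")

# On restart, loading the same tag restores model and optimizer state.
engine.load_checkpoint("checkpoints", tag="global_step1361")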
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
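This banner is emitted once per launched worker process: the launcher pins OMP_NUM_THREADS=1 so that many ranks on one node do not each spawn a full-width OpenMP thread pool. A hedged sketch of tuning it per process follows; the value 4 is an illustrative assumption, not the setting used in this run.

# Sketch: override the launcher's OMP_NUM_THREADS=1 default before any
# thread pools are created. Roughly cores_per_node / ranks_per_node is a
# common starting point; 4 here is only an example value.
import os

os.environ.setdefault("OMP_NUM_THREADS", "4")

import torch  # imported after setting the env var so it is picked up

torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))  # intra-op threads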
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
[OKAY] ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adamninja ............... ..................[NO] [OKAY]....... - [OKAY] --------------------------------------------------- -op name ................ installed .. compatible -fused_adam --------------------------------------------------............. -[NO] ....... [OKAY] -fused_lambcpu_adam ............. ...............[NO] [NO]....... [OKAY]....... - [OKAY] -sparse_attnfused_adam ......................... [NO] [NO]....... [OKAY]....... - [OKAY]transformer - ............ [NO] .......fused_lamb [OKAY]............. - [NO]stochastic_transformer ....... .[OKAY] -[NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninja .................. ..................[OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- -op name op name................ ................installed installed.. ..compatible -compatible --------------------------------------------------- --------------------------------------------------- -cpu_adam cpu_adam............... [NO]............... .......[NO] [OKAY]....... - [OKAY] -ninja ..................fused_adam fused_adam [OKAY] ............. -............. --------------------------------------------------[NO][NO] - .......op name....... [OKAY] ................ -[OKAY] installed - fused_lamb..fused_lamb compatible ............. -............. --------------------------------------------------[NO][NO] - .............. [OKAY][OKAY] - -cpu_adam ............... [NO] ....... [OKAY] -sparse_attn ............sparse_attn [NO]............ .......[NO] [OKAY]....... - fused_adam[OKAY] transformer -............. 
transformer............[NO] ............[NO]....... .......[NO][OKAY] -[OKAY]....... - [OKAY] -fused_lambstochastic_transformer .............stochastic_transformer .[NO] .[NO]....... [NO][OKAY]....... - [OKAY]....... - [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -ninjasparse_attn ............ ..................[NO] .......[OKAY] [OKAY] - --------------------------------------------------- -transformerop name ............................ [NO] installed....... ..[OKAY] - compatible ---------------------------------------------------stochastic_transformer - . [NO] ....... [OKAY]cpu_adam - ............... [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -fused_adamop name ............................. [NO]installed ......... compatible[OKAY] - --------------------------------------------------- -fused_lamb ............. [NO] ....... [OKAY]cpu_adam - ............... [NO] ....... 
[OKAY] -ninjasparse_attn ............fused_adam [NO]............................... .......[OKAY][NO] - [OKAY]....... - --------------------------------------------------[OKAY]transformer - - ............op name [NO] fused_lamb ................ .................... [NO][OKAY]installed -....... [OKAY]..stochastic_transformer - compatible. - [NO]-------------------------------------------------- ....... - [OKAY] -sparse_attn ............ [NO]cpu_adam ....... [OKAY] -............... transformer[NO] ............ .......[NO] .......[OKAY] [OKAY] - -stochastic_transformer . [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninja .................................... [OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adamninja .............................. .................. [NO] [NO] [OKAY] ....... - .......-------------------------------------------------- [OKAY] -[OKAY] -op name - ninja................ installed.................. ..[OKAY] compatiblefused_adam - -fused_adam -------------------------------------------------- --------------------------------------------------............. - -............. [NO]op name[NO] ....................... ....... [OKAY]cpu_adam[OKAY] -installed - ................. fused_lamb[NO]compatiblefused_lamb - .......-------------------------------------------------- ............. -.............[OKAY] -[NO][NO] .............. 
[OKAY][OKAY] - -cpu_adam ...............fused_adam [NO]............. sparse_attnsparse_attn.......[NO] [OKAY] ............ -................... [NO][NO][OKAY] -.............. [OKAY][OKAY] - -fused_lamb transformertransformer............. ............[NO]............fused_adam [NO]....... [NO]............. ....... [OKAY] -.......[NO] [OKAY][OKAY]....... - - [OKAY] -stochastic_transformerstochastic_transformer fused_lamb. ..............sparse_attn[NO] [NO] ............[NO]....... [NO].......[OKAY] ....... - [OKAY].......[OKAY] - -[OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer sparse_attn. ............[NO] ....... [OKAY] -[NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ninja....... ..................[OKAY] [OKAY] - --------------------------------------------------- -op name ................ installed .. compatiblesparse_attn - --------------------------------------------------............ - [NO] ....... [OKAY] -transformercpu_adam ninja........................... ..................[NO][NO] [OKAY].............. - [OKAY]-------------------------------------------------- - -[OKAY] -op name ................ installedstochastic_transformer .. compatible.fused_adam --------------------------------------------------- -[NO]............. [NO]....... .......[OKAY]cpu_adam [OKAY] - -ninja............... [NO]fused_lamb .................. .................... [OKAY][NO][OKAY] - ....... - [OKAY]-------------------------------------------------- - -op name ................ installed fused_adam.. 
.............compatible -[NO]--------------------------------------------------sparse_attn -....... ............[OKAY] -[NO] ....... fused_lambcpu_adam[OKAY] - ............. ...............[NO] transformer.......[NO] ............ [OKAY] ....... -[NO] .......[OKAY] -[OKAY] -stochastic_transformer . [NO]sparse_attn ................... fused_adam[OKAY] -[NO]............. ....... [OKAY][NO] - ....... transformer[OKAY] -............ [NO] .......fused_lamb [OKAY] -............. [NO] stochastic_transformer....... [OKAY]. - [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam .............ninja [NO] ..................ninja....... ninja [OKAY][OKAY] .................. - -.................. --------------------------------------------------[OKAY] -[OKAY] -op namefused_lamb - -------------------------------------------------- --------------------------------------------------................ - - .............op nameinstalled op name [NO].. ................................compatible -installed.......installed -------------------------------------------------- .. -..[OKAY] -compatiblecompatible - --------------------------------------------------- ---------------------------------------------------cpu_adam - ............... [NO] ....... [OKAY]cpu_adamsparse_attn cpu_adam............... -............ ............... [NO] [NO] [NO] ....... .......[OKAY]....... -[OKAY] fused_adam -[OKAY] -transformer ......................... [NO][NO] fused_adam fused_adam........................... 
[OKAY][NO] - .............[OKAY] - .......stochastic_transformer[NO] [OKAY]........ - fused_lamb[OKAY][NO] - fused_lamb.................... .............[NO]fused_lamb [OKAY] [NO] - .................... .......[OKAY][NO] - ....... [OKAY] -[OKAY] -sparse_attn ............ [NO] ....... sparse_attn[OKAY] -............ transformer[NO] ............sparse_attn....... [NO] ............ [OKAY]....... - [OKAY][NO] - transformer....... ............stochastic_transformer[OKAY] -.[NO] transformer [NO] ....... ............ ....... [OKAY] [NO] -[OKAY] -....... [OKAY] -stochastic_transformer .stochastic_transformer [NO] ....... [OKAY] -. [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- -DeepSpeed C++/CUDA extension op report - - -JIT compiled ops requires ninja--------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninja .................................... [OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- -op name op name................ ................installed installed.. ..compatible -compatible ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [NO][NO] .............. [OKAY][OKAY] - -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lamb .............fused_lamb [NO]............. .......[NO] [OKAY]....... - [OKAY]ninja - .................. [OKAY] ---------------------------------------------------ninja - op name.................. ................sparse_attn[OKAY] installedsparse_attn............ - ..--------------------------------------------------............[NO] - [NO]compatible op name....... - ....... [OKAY] --------------------------------------------------................ - -[OKAY]installed - transformer..transformer cpu_adam............ compatible............ ...............[NO] - [NO][NO]--------------------------------------------------....... 
- ..............[OKAY] - [OKAY][OKAY] - -cpu_adamstochastic_transformer ............... stochastic_transformer.[NO] [NO]........fused_adam ....... [NO] [OKAY] -....................[OKAY] [NO] -[OKAY] -....... [OKAY] -fused_adamfused_lamb .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -sparse_attn transformer............ ............[NO] [NO]....... .......[OKAY] -[OKAY] -transformer ............ stochastic_transformer[NO] ....... .[OKAY] -[NO] .......stochastic_transformer [OKAY] -. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] -ninja-------------------------------------------------- - ..................op name [OKAY]................ - installed-------------------------------------------------- -.. op namecompatible -................-------------------------------------------------- -installed .. compatibleninja --------------------------------------------------- -cpu_adam.................. ...............[OKAY] -[NO] .......-------------------------------------------------- -[OKAY] -op namecpu_adam ............................... installed[NO] ......... compatible[OKAY] - ---------------------------------------------------fused_adam - ............. [NO] ....... [OKAY] -cpu_adam ...............fused_lamb fused_adam [NO] ............. ............. ....... [NO] [NO] [OKAY] ....... -....... [OKAY][OKAY] - -fused_lamb ............. fused_adam[NO] .................... [NO][OKAY]sparse_attn - ................... [OKAY][NO] - ....... [OKAY] -fused_lamb transformer............. ............[NO] [NO]....... sparse_attn.......[OKAY] -[OKAY]............ - [NO] ....... [OKAY]stochastic_transformer - . transformer[NO] ...................sparse_attn [OKAY][NO]............ - .......[NO] [OKAY]....... - [OKAY] -stochastic_transformertransformer ............ .[NO] [NO]....... 
.......[OKAY] -[OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ------------------------------------------------------------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -ninjafused_lamb ............................... [OKAY][NO] - .......-------------------------------------------------- -[OKAY] -op name ................ installed .. compatible --------------------------------------------------- -sparse_attn ............ [NO] cpu_adam....... ...............[OKAY] -[NO] .......transformer [OKAY] -............ [NO] ....... [OKAY] -stochastic_transformer fused_adam .............. [NO][NO] .............. [OKAY][OKAY] - -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [NO][NO] .............. 
[OKAY][OKAY] - -fused_adam .............fused_adam [NO]............. .......[NO] [OKAY]....... - [OKAY] -fused_lamb .............fused_lamb [NO]............. .......[NO] [OKAY]....... - [OKAY] -sparse_attn sparse_attn............ ............[NO] [NO]....... .......[OKAY] -[OKAY] -transformer transformer............ ............[NO] [NO]....... [OKAY]....... - [OKAY] -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -ninja .................. [OKAY] -cpu_adam --------------------------------------------------............... - [NO]op name ....... ................[OKAY] -installed .. compatible --------------------------------------------------- -fused_adam ............. cpu_adam[NO] ...................... [NO][OKAY] -....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -sparse_attnfused_lamb ......................... [NO][NO] .............. [OKAY][OKAY] - -transformer ............ [NO] ....... [OKAY] -sparse_attnstochastic_transformer ............ [NO]. .......[NO] .......[OKAY] -[OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -ninja .................. [OKAY] -cpu_adam-------------------------------------------------- -............... op name[NO] ....................... installed[OKAY] -.. compatible --------------------------------------------------- -fused_adam .............cpu_adam [NO]............... .......[NO] [OKAY] -....... [OKAY] -fused_lamb ............. [NO] ....... 
[OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb .............sparse_attn [NO]............ .......[NO] [OKAY]....... - [OKAY] -transformer ............ [NO] ....... [OKAY] -sparse_attn ............stochastic_transformer [NO] ........ [OKAY][NO] - ....... transformer[OKAY] -............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -ninjafused_adam ............................... [OKAY][NO] - --------------------------------------------------....... - op name[OKAY] -................ installed .. fused_lambcompatible - -------------------------------------------------- -............. [NO] ....... [OKAY] -cpu_adam ............... [NO] ....... [OKAY] -sparse_attnfused_adam ......................... [NO] .......[NO] [OKAY] -....... fused_lamb[OKAY] ............. - [NO] .......transformer [OKAY] -............ 
[NO] ....... [OKAY] -stochastic_transformer sparse_attn ............. [NO] [NO]....... .......[OKAY] - transformer[OKAY] -............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... ninja[NO] ......................... [OKAY][OKAY] - --------------------------------------------------- -op name ................ installed ..fused_adam compatible............. - [NO]-------------------------------------------------- -....... [OKAY] -fused_lambcpu_adam ............. ...............ninja[NO] [NO]......................... ninja[OKAY][OKAY] ....... - -.................. [OKAY]--------------------------------------------------[OKAY] - - -op name --------------------------------------------------................ - installedop name sparse_attn .. ............................fused_adam compatibleinstalled.............[NO] - .......--------------------------------------------------.. -[NO][OKAY] - compatible....... - transformer--------------------------------------------------[OKAY] -cpu_adam -............ ...............[NO] fused_lamb [NO]....... ............. cpu_adam....... [OKAY] [NO] - ............... [OKAY] ....... - stochastic_transformer[NO][OKAY] -........ [OKAY][NO]fused_adam - .................... [OKAY][NO] - ....... [OKAY]sparse_attn - ............ fused_lamb[NO] fused_adam ............. ....................[NO] [OKAY].......[NO] - [OKAY]....... - transformer[OKAY] -............ [NO]fused_lamb .................... [OKAY][NO] -....... [OKAY] -sparse_attnstochastic_transformer ............ [NO]. ....... [NO][OKAY]sparse_attn - ...................transformer [OKAY][NO]............ - [NO]....... ....... [OKAY][OKAY] - -transformer stochastic_transformer............ [NO]. .......[NO] [OKAY]....... - [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ninja [NO] ......................... [OKAY][OKAY] - --------------------------------------------------- -stochastic_transformer op name ................. [NO]installed ......... [OKAY]compatible - --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ninja....... [OKAY].................. - [OKAY] --------------------------------------------------- -op name ................fused_adam installed............. ..[NO] compatible....... - [OKAY]-------------------------------------------------- - -fused_lamb ............. [NO] ....... [OKAY] -cpu_adam ............... [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] .......fused_adam [OKAY]............. - [NO]stochastic_transformer ........ [OKAY][NO] - ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
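The report above is DeepSpeed's standard extension-op summary (the same output the ds_report utility prints); every rank emits an identical copy, which is why it repeats throughout the log. Every op is listed as not pre-installed ([NO]) but compatible ([OKAY]), so each one is JIT-compiled with ninja on first use. A minimal sketch of how the same per-op checks can be reproduced from Python, assuming the op-builder API of deepspeed ~0.5.x:

    # Sketch, assuming deepspeed ~0.5.x: query the same per-op status the
    # report above shows. is_compatible() mirrors the "compatible" column;
    # ops marked [NO] under "installed" are built by ninja on first load().
    from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder

    for builder in (CPUAdamBuilder(), FusedAdamBuilder()):
        print(builder.NAME, "compatible:", builder.is_compatible())

    # CPUAdamBuilder().load() would trigger the ninja JIT build immediately
    # instead of on first use during training.

Ops can also be pre-compiled at install time (DS_BUILD_OPS=1 pip install deepspeed) so the compute nodes skip the JIT step entirely.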
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
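async_io is the only op reported as neither installed nor compatible ([NO] in both columns) because the libaio development files are missing on the nodes. The warning's own suggested workaround is sketched below; the libaio prefix is a hypothetical placeholder, and the builder import assumes deepspeed ~0.5.x:

    # Sketch of the workaround the warning above suggests: expose a libaio
    # built from source via CFLAGS/LDFLAGS before the JIT build runs.
    import os

    os.environ["CFLAGS"] = "-I/path/to/libaio/include"   # hypothetical path
    os.environ["LDFLAGS"] = "-L/path/to/libaio/lib"      # hypothetical path

    from deepspeed.ops.op_builder import AsyncIOBuilder
    print("async_io compatible:", AsyncIOBuilder().is_compatible())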
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -DeepSpeed general environment info: -torch install pathtorch install path ............... ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 -deepspeed install path -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - deepspeed install path........... ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']deepspeed info - deepspeed info................... ...................0.5.5+29bee73, 29bee73, master -torch version .................... 1.8.1 -0.5.5+29bee73, 29bee73, masterdeepspeed wheel compiled w. - deepspeed wheel compiled w....... ......torch 1.8, cuda 11.1 -torch 1.8, cuda 11.1 -torch cuda version ............... 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... 
[OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ ninja[NO] ......................... [OKAY][OKAY] -ninja - --------------------------------------------------.................. stochastic_transformer -[OKAY] -op name .--------------------------------------------------................ - installed[NO]op name .................. ....... installed compatible [OKAY].. - - --------------------------------------------------compatible - --------------------------------------------------- -cpu_adam ............... 
[NO]cpu_adam ...................... [OKAY][NO] - ....... [OKAY] -fused_adam ............. [NO] ....... fused_adam[OKAY] -............. [NO] .......fused_lamb [OKAY]............. - [NO] ....... [OKAY]fused_lamb - ............. [NO] ....... [OKAY] -sparse_attn ............ [NO]sparse_attn ................... [OKAY][NO] - ....... transformer[OKAY] -............ [NO] .......transformer [OKAY]............ - [NO] ....... [OKAY]stochastic_transformer - . stochastic_transformer[NO] ....... .[OKAY] -[NO] ....... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] --------------------------------------------------- -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
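For reference, a sketch of how the fields in this environment report can be reproduced from Python; the `__git_hash__`/`__git_branch__` attributes are assumed from DeepSpeed 0.5.x, and nvcc is read from whatever CUDA toolkit is on PATH:

```python
# Sketch: reproduce the "general environment info" fields above.
import subprocess
import torch
import deepspeed

print("torch install path ...", torch.__path__)
print("torch version ........", torch.__version__)
print("torch cuda version ...", torch.version.cuda)
print("deepspeed install path", deepspeed.__path__)
print("deepspeed info .......", deepspeed.__version__,
      deepspeed.__git_hash__, deepspeed.__git_branch__)
# nvcc version comes from the CUDA toolkit visible on PATH
print(subprocess.run(["nvcc", "--version"], capture_output=True,
                     text=True).stdout)
```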
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -DeepSpeed general environment info: -utils .................. [NO] ....... [OKAY] -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -quantizer .............. [NO] ....... [OKAY] -torch cuda version ............... 11.1 --------------------------------------------------- -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io: please install the libaio-devel package with yum -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. 
-async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 
11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
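The async_io [NO] status above comes down to whether libaio can be found on the node. As a rough sketch (a hypothetical re-creation, not DeepSpeed's actual detection code), the probe amounts to:

    # Hypothetical sketch of the libaio probe behind async_io's [NO] status;
    # the real builder checks for both the shared object and its headers.
    import ctypes.util

    if ctypes.util.find_library("aio") is None:
        # Remedy per the warning: install libaio-devel, or point CFLAGS and
        # LDFLAGS at a from-source libaio build before rebuilding the op.
        print("libaio missing -> async_io stays [NO]")
    else:
        print("libaio found -> async_io can be JIT-compiled")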
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
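The torch/CUDA pairing reported in the environment block above can be cross-checked from the same interpreter with plain PyTorch attributes; a minimal sketch (the full report itself comes from DeepSpeed's ds_report tooling):

    # Cross-check of the versions reported in the environment block.
    import torch

    print("torch version .....", torch.__version__)   # 1.8.1 in this run
    print("torch cuda version ", torch.version.cuda)  # 11.1 in this run
    print("cuda available ....", torch.cuda.is_available())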
[NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32 -using torch.float16 for parameters ... ------------------------- arguments ------------------------ - accumulate_allreduce_grads_in_fp32 .............. False - adam_beta1 ...................................... 0.9 - adam_beta2 ...................................... 0.95 - adam_eps ........................................ 1e-08 - adlr_autoresume ................................. False - adlr_autoresume_interval ........................ 1000 - apply_query_key_layer_scaling ................... True - apply_residual_connection_post_layernorm ........ False - attention_dropout ............................... 0.1 - attention_softmax_in_fp32 ....................... False - bert_binary_head ................................ True - bert_load ....................................... None - bf16 ............................................ False - bias_dropout_fusion ............................. True - bias_gelu_fusion ................................ True - biencoder_projection_dim ........................ 0 - biencoder_shared_query_context_model ............ False - block_data_path ................................. None - checkpoint_activations .......................... True - checkpoint_in_cpu ............................... False - checkpoint_num_layers ........................... 1 - clip_grad ....................................... 1.0 - codecarbon_dir .................................. None - consumed_train_samples .......................... 0 - consumed_train_tokens ........................... 
0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1685428.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 150
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 1190
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.006
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 0.0001
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 3750000
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes .................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
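A quick consistency check on the batch-size arguments above (a back-of-the-envelope sketch, not the training code; the variable names simply mirror the argument dump):

    # Hypothetical sketch: reproduces the micro-batch bookkeeping implied by the arguments.
    global_batch_size = 2048
    micro_batch_size = 1
    data_parallel_size = 1   # world_size 128 / (tensor parallel 4 * pipeline parallel 32)

    num_micro_batches = global_batch_size // (micro_batch_size * data_parallel_size)
    assert num_micro_batches == 2048   # matches "setting number of micro-batches to constant 2048"

    # The run is token-bounded: train_tokens (300B) binds long before train_samples (600M).
    seq_length = 2048
    samples_in_token_budget = 300_000_000_000 // seq_length          # 146,484,375 samples
    optimizer_steps = samples_in_token_budget // global_batch_size   # ~71,525 steps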
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops require ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
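The "padded vocab" line above follows directly from the tokenizer arguments: the vocabulary is padded to a multiple of make_vocab_size_divisible_by times tensor_model_parallel_size, so that every tensor-parallel shard of the embedding has the same size. A minimal sketch of that rule (assuming the standard pad-to-multiple behaviour, not quoting the source):

    orig_vocab_size = 50257                  # GPT-2 BPE
    make_vocab_size_divisible_by = 128
    tensor_model_parallel_size = 4

    multiple = make_vocab_size_divisible_by * tensor_model_parallel_size   # 512
    padded_vocab_size = ((orig_vocab_size + multiple - 1) // multiple) * multiple
    assert padded_vocab_size == 50688
    assert padded_vocab_size - orig_vocab_size == 431   # the 431 dummy tokens in the log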
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-26 12:53:25,656] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.301 seconds
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler PyTorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
->>> done with compiling and loading fused kernels. Compilation time: 20.526 seconds
-time to initialize megatron (seconds): 58.485
-[after megatron is initialized] datetime: 2021-10-26 12:53:46
-building GPT model ...
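For the topology dump below: tensor_model_parallel_size = 4, pipeline_model_parallel_size = 32 and data_parallel_size = 1 account for all 128 ranks, with the tensor (model) dimension varying fastest, then data, then pipeline stage. A hypothetical re-derivation of the printed ProcessCoord -> rank mapping (coord_to_rank is an illustrative helper, not a Megatron or DeepSpeed API):

    TP, PP, DP = 4, 32, 1
    assert TP * PP * DP == 128                 # world_size from the arguments

    def coord_to_rank(pipe: int, data: int, model: int) -> int:
        # model varies fastest, then data, then pipe, as in the dump below
        return (pipe * DP + data) * TP + model

    assert coord_to_rank(0, 0, 3) == 3         # ProcessCoord(pipe=0, data=0, model=3): 3
    assert coord_to_rank(1, 0, 0) == 4         # ProcessCoord(pipe=1, data=0, model=0): 4
    assert coord_to_rank(18, 0, 0) == 72       # ProcessCoord(pipe=18, data=0, model=0): 72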
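The stage partitioning and per-rank parameter counts further below can be sanity-checked the same way: 64 transformer layers over 32 stages gives 2 layers per stage (the first and last stages additionally carry the tied embedding, the final layer norm and the fp16/fp32 casts), and standard GPT-style sizing reproduces the logged 807,539,800 parameters per (tensor, pipeline) rank up to biases and layernorms. A rough sketch under those assumptions, not the model code:

    hidden = 11600
    ffn_hidden = 46400                        # ffn_hidden_size = 4 * hidden
    layers_per_stage = 64 // 32               # 2 ParallelTransformerLayerPipe per stage
    tensor_parallel = 4

    # attention (4 h^2) + MLP (2 * h * 4h) weight matrices per layer, split 4 ways
    per_rank = layers_per_stage * (4 * hidden**2 + 2 * hidden * ffn_hidden) // tensor_parallel
    # per_rank == 807,360,000; the logged 807,539,800 adds biases and layernorm weights

    total_middle_stages = 807_539_800 * 128   # ~103.4B across all ranks
    embedding = 50688 * 11600                 # ~0.59B tied input/output embedding
    # together roughly 104B, matching the tr8b-104B run name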
-[2021-10-26 12:53:46,538] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-26 12:53:46,538] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-26 12:53:46,539] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.57 GB, percent = 21.7%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=p, data=0, model=m): 4*p + m for p in 0..31, m in 0..3} (all 128 entries printed, from ProcessCoord(pipe=0, data=0, model=0): 0 through ProcessCoord(pipe=31, data=0, model=3): 127)
-[2021-10-26 12:53:48,213] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-    0: _to_float16
-    1: EmbeddingPipe
-    2:
-    3: ParallelTransformerLayerPipe
-    4: ParallelTransformerLayerPipe
-stage=1 layers=2
-    5: ParallelTransformerLayerPipe
-    6: ParallelTransformerLayerPipe
-stage=2 through stage=30: layers=2 each, two ParallelTransformerLayerPipe layers per stage (indices 7..64)
-stage=31 layers=6
-    65: ParallelTransformerLayerPipe
-    66: ParallelTransformerLayerPipe
-    67:
-    68: MixedFusedLayerNorm
-    69: EmbeddingPipe
-    70: float16_to_fp32
-  loss: CrossEntropy
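For orientation, the flat rank in the topology table above is a row-major walk over the (pipe, data, model) grid. A minimal sketch of that mapping, assuming the pipe=32 / data=1 / model=4 sizes from this run (illustrative only, not DeepSpeed's actual topology class):

    # Reconstruct the (pipe, data, model) -> rank mapping printed above.
    # Grid sizes are the ones from this log: PP=32, DP=1, TP=4 (128 ranks).
    PP, DP, TP = 32, 1, 4

    def coord_to_rank(pipe, data, model):
        # pipe varies slowest, model fastest, matching the printed table
        return (pipe * DP + data) * TP + model

    assert coord_to_rank(0, 0, 0) == 0
    assert coord_to_rank(18, 0, 0) == 72   # e.g. ProcessCoord(pipe=18, data=0, model=0): 72
    assert coord_to_rank(31, 0, 3) == 127  # final rank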
- > number of parameters on (tensor, pipeline) model parallel rank (t, p): 807539800 (one such line per rank, arrival order varying, covering every tensor rank t in 0..3 and pipeline stage p in 1..30)
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... (x128, once per rank)
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
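The per-stage parameter counts are internally consistent; the check below is a hedged back-of-envelope, assuming hidden size 11600, tensor-parallel size 4, padded vocabulary 50688 and sequence length 2048. None of those hyperparameters appear in this excerpt; they are inferred because they fit the logged counts exactly.

    # Hedged back-of-envelope for the parameter counts above.
    h, tp = 11600, 4
    middle, first, last = 807_539_800, 978_291_800, 978_315_000

    # The last stage holds everything the first stage does, plus the final
    # MixedFusedLayerNorm (weight + bias = 2*h):
    assert last - first == 2 * h

    # The embedding stages carry an extra word-embedding shard plus the
    # (replicated) position embeddings, per tensor-parallel rank:
    assert first - middle == (50688 // tp + 2048) * h   # 170,752,000

    # Each middle stage holds two transformer layers; a layer's dominant
    # 12*h^2 weights split 4 ways give 3*h^2 per rank:
    assert 2 * 3 * h * h == 807_360_000   # logged later as the first ZeRO group size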
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
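The "method type:transformer" line earlier explains the stage layout: only layers whose class name matches the pattern count toward the balance, so the 64 ParallelTransformerLayerPipe layers split exactly two per stage while the cast, embedding and norm layers ride along on the end stages. An illustrative re-implementation follows (not DeepSpeed's actual partitioner; the two unnamed layers at indices 2 and 67 are written as "<lambda>" here, which is an assumption):

    import re

    layer_names = (["_to_float16", "EmbeddingPipe", "<lambda>"]
                   + ["ParallelTransformerLayerPipe"] * 64
                   + ["<lambda>", "MixedFusedLayerNorm", "EmbeddingPipe", "float16_to_fp32"])
    num_stages = 32
    # Only layers matching the "transformer" pattern carry weight in the balance:
    weights = [1 if re.search("transformer", name, re.IGNORECASE) else 0
               for name in layer_names]
    assert sum(weights) == 64
    print(sum(weights) // num_stages)   # 2 countable layers per stage, as logged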
-[2021-10-26 12:53:48,896] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-26 12:53:48,897] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-26 12:53:48,897] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.74 GB, percent = 21.8%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-26 12:53:48,898] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-[2021-10-26 12:53:48,935] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-26 12:53:48,935] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-26 12:53:48,935] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-26 12:53:48,935] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-26 12:53:48,936] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-26 12:53:48,936] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-26 12:53:48,936] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-26 12:53:48,936] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-26 12:53:48,936] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-26 12:53:48,936] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-                               !! WARNING !!
-
- warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
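The optimizer lines above describe an fp16 ZeRO stage 1 setup wrapping a client FusedAdam. A hedged reconstruction of the matching ds_config fragment is sketched below; the field names follow DeepSpeed's documented JSON schema, only values actually printed in the log are filled in, and the rest of the real config is omitted.

    ds_config = {
        "fp16": {"enabled": True},               # "Creating fp16 ZeRO stage 1 optimizer"
        "zero_optimization": {
            "stage": 1,
            "reduce_bucket_size": 500000000,     # "Reduce bucket size 500000000"
            "allgather_bucket_size": 500000000,  # "Allgather bucket size 500000000"
        },
        # "CPU Offload: False" and "Round robin gradient partitioning: False"
        # match DeepSpeed's defaults, so they need no explicit entries here.
    }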
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils... (x128, once per rank)
-Time to load utils op: 0.4995734691619873 seconds (x128, once per rank; the first build-and-load takes between roughly 0.40 and 0.57 seconds on each rank, the lines interleaving as ranks finish)
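The "extensions root" / ninja / "Time to load" chatter comes from PyTorch's JIT extension builder, which DeepSpeed uses to compile its fused ops on first use. A minimal sketch of that mechanism follows; the extension name matches the log, but the source list is hypothetical:

    import os
    from torch.utils.cpp_extension import load

    # TORCH_EXTENSIONS_DIR is the "PyTorch extensions root" from the log.
    os.environ.setdefault("TORCH_EXTENSIONS_DIR",
                          "/gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions")
    utils_op = load(
        name="utils",
        sources=["csrc/flatten_unflatten.cpp"],  # hypothetical C++ source for the op
        verbose=True,  # emits the "Emitting ninja build file ..." / timing lines
    )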
-Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)] (x120: one such line per rank, in arrival order, for every middle-stage rank 4..123, all with identical sizes)
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)] (likewise for ranks 1, 2 and 3)
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)] (likewise for ranks 124, 126 and 127)
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... (repeated across the ranks as each reloads the op)
-No modifications detected for re-loaded extension module utils, skipping build step... (repeated across the ranks)
-Loading extension module utils... (repeated across the ranks)
-Time to load utils op: 0.0024662017822265625 seconds (repeated across the ranks; the cached re-load takes roughly 0.001 to 0.0025 seconds per rank)
-No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -Time to load utils op: 0.0010037422180175781 seconds -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.001222848892211914 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0012845993041992188 seconds -Loading extension module utils...Loading extension module utils... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0010268688201904297 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0009877681732177734 seconds -Time to load utils op: 0.0012788772583007812 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Time to load utils op: 0.0010693073272705078 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -Loading extension module utils... 
-No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -Time to load utils op: 0.0010724067687988281 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -Time to load utils op: 0.0013093948364257812 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -Time to load utils op: 0.0012943744659423828 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0009818077087402344 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0011496543884277344 seconds -Time to load utils op: 0.0009655952453613281 seconds -Time to load utils op: 0.0010230541229248047 seconds -Loading extension module utils... -Time to load utils op: 0.001211404800415039 seconds -Time to load utils op: 0.0011265277862548828 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.001291036605834961 seconds -Loading extension module utils... -Time to load utils op: 0.0009751319885253906 seconds -Time to load utils op: 0.001031637191772461 seconds -Time to load utils op: 0.000982046127319336 seconds -Time to load utils op: 0.0009381771087646484 seconds -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -Time to load utils op: 0.0012388229370117188 seconds -Time to load utils op: 0.0012941360473632812 seconds -Time to load utils op: 0.0010802745819091797 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0009806156158447266 seconds -Time to load utils op: 0.001260995864868164 seconds -Loading extension module utils... -Time to load utils op: 0.0010280609130859375 seconds -Time to load utils op: 0.0009777545928955078 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00112152099609375 seconds -Time to load utils op: 0.0011768341064453125 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0010721683502197266 seconds -Loading extension module utils... -Time to load utils op: 0.0012142658233642578 seconds -Time to load utils op: 0.0009822845458984375 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0010654926300048828 seconds -Time to load utils op: 0.0010848045349121094 seconds -Time to load utils op: 0.001039266586303711 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Time to load utils op: 0.0013267993927001953 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0010709762573242188 seconds -Time to load utils op: 0.0013058185577392578 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.000990152359008789 seconds -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0011532306671142578 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0011110305786132812 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -Time to load utils op: 0.001008749008178711 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0009987354278564453 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0009920597076416016 seconds -Time to load utils op: 0.0015659332275390625 seconds -Time to load utils op: 0.0010385513305664062 seconds -Time to load utils op: 0.0009670257568359375 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0010030269622802734 seconds -Time to load utils op: 0.0012662410736083984 seconds -Loading extension module utils... -Time to load utils op: 0.0011286735534667969 seconds -Time to load utils op: 0.001397848129272461 seconds -Time to load utils op: 0.0014357566833496094 seconds -Time to load utils op: 0.0013446807861328125 seconds -Time to load utils op: 0.0011067390441894531 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0010423660278320312 seconds -Time to load utils op: 0.001055002212524414 seconds -Time to load utils op: 0.0012896060943603516 seconds -Time to load utils op: 0.0009505748748779297 seconds -Time to load utils op: 0.0010139942169189453 seconds -Time to load utils op: 0.0011959075927734375 seconds -Time to load utils op: 0.0013148784637451172 seconds -Time to load utils op: 0.0010657310485839844 seconds -Time to load utils op: 0.0010836124420166016 seconds -Time to load utils op: 0.0012965202331542969 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... -Time to load utils op: 0.001056671142578125 seconds - -Time to load utils op: 0.0012798309326171875 seconds -Time to load utils op: 0.0010733604431152344 seconds -Time to load utils op: 0.0014564990997314453 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - -Time to load utils op: 0.00106048583984375 seconds -Loading extension module utils... -Time to load utils op: 0.0011210441589355469 seconds -Time to load utils op: 0.0009739398956298828 seconds -Time to load utils op: 0.0011737346649169922 seconds -Time to load utils op: 0.0010440349578857422 seconds -Time to load utils op: 0.0013265609741210938 secondsTime to load utils op: 0.0013506412506103516 seconds - -Time to load utils op: 0.0012309551239013672 secondsTime to load utils op: 0.0013492107391357422 seconds - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
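The four messages above repeat once per process; with 128 ranks sharing one filesystem cache, each rank finds the already-built op and loads it in about a millisecond. They come from PyTorch's JIT extension loader, which DeepSpeed uses to compile its fused ops once and then reuse the cached binary. A minimal sketch of that mechanism, assuming a local source file my_op.cpp (the file name and cache path here are illustrative, not taken from this run):

import os
# Illustrative cache location; this run resolves it to the user's
# .cache/torch_extensions directory, as printed above.
os.environ.setdefault("TORCH_EXTENSIONS_DIR",
                      os.path.expanduser("~/.cache/torch_extensions"))

from torch.utils.cpp_extension import load

# The first call compiles my_op.cpp into the extensions root; later calls
# detect no source changes, print "No modifications detected ... skipping
# build step...", and simply load the cached library.
utils_op = load(name="utils", sources=["my_op.cpp"], verbose=True)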
-[2021-10-26 12:53:51,192] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-26 12:53:51,193] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-26 12:53:51,193] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.76 GB, percent = 21.8%
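These MA/CA lines are emitted by DeepSpeed's see_memory_usage helper: MA is currently allocated CUDA memory, CA is memory reserved by the caching allocator, and the Max_ variants are peaks since the last reset. A minimal sketch of calling it directly (assumes deepspeed is installed and a CUDA device is visible):

from deepspeed.runtime.utils import see_memory_usage

# Prints the GPU MA/Max_MA/CA/Max_CA line and the CPU virtual-memory line
# in the same format as the log above; force=True prints even where the
# helper would normally stay silent.
see_memory_usage("Before initializing optimizer states", force=True)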
-[2021-10-26 12:53:51,245] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-26 12:53:51,246] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-26 12:53:51,246] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.76 GB, percent = 21.8%
-[2021-10-26 12:53:51,246] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-26 12:53:51,280] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-26 12:53:51,281] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-26 12:53:51,281] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.76 GB, percent = 21.8%
-[2021-10-26 12:53:51,281] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-26 12:53:51,281] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-26 12:53:51,281] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-26 12:53:51,281] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-26 12:53:51,282] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] activation_checkpointing_config {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-26 12:53:51,282] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-26 12:53:51,283] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-26 12:53:51,284] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-26 12:53:51,284] [INFO] [config.py:944:print] zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-26 12:53:51,284] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-26 12:53:51,284] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-26 12:53:51,284] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
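The json block above is the client-side config for this run: micro-batch 1 with a global batch of 2048, ZeRO stage 1, fp16 with dynamic loss scaling starting at 2**12 = 4096 (matching initial_dynamic_scale above), and a seqlen curriculum growing from 64 to 2048 over 36000 steps. A minimal sketch of handing an equivalent dict to deepspeed.initialize; the toy model and optimizer are placeholders, not the 104B-parameter model trained here:

import torch
import deepspeed

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "train_batch_size": 2048,      # must equal micro_batch * grad_accum * data_parallel
    "gradient_clipping": 1.0,
    "zero_optimization": {"stage": 1},
    "fp16": {
        "enabled": True,
        "loss_scale": 0,           # 0 selects dynamic loss scaling
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 12, # initial scale 2**12 = 4096
    },
    "steps_per_print": 2000,
}

model = torch.nn.Linear(8, 8)                          # toy stand-in
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)   # client optimizer, like the FusedAdam above

# optimizer_name is None in the dump above because the optimizer is supplied
# by the client rather than built from the config.
engine, optimizer, _, _ = deepspeed.initialize(
    model=model, optimizer=opt, config=ds_config)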
-[2021-10-26 12:53:51,285] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-26 12:53:51,686] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
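The table above describes the pipeline partition: 32 stages replicated 4-way data parallel (128 ranks in total), with two transformer layers per middle stage and larger first and last stages, which presumably also hold the embedding and output modules (an inference from the printed ranges, not stated in the log). A small sketch that reproduces the [start, end) ranges printed above:

# Reproduce the per-stage layer ranges of a 32-stage partition over the
# 71-item layer list implied by the [0, 71) indices above.
def stage_ranges(n_items=71, n_stages=32, first=5, last=6, middle=2):
    ranges = [(0, first)]
    start = first
    for _ in range(n_stages - 2):
        ranges.append((start, start + middle))
        start += middle
    ranges.append((start, start + last))
    assert start + last == n_items  # 5 + 30*2 + 6 = 71
    return ranges

for stage, (lo, hi) in enumerate(stage_ranges()):
    print(f"STAGE={stage} LAYERS={hi - lo} [{lo}, {hi})")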
- > using checkpoint value 0.0001 for learning rate
- > using checkpoint value 6e-06 for minimum learning rate
- > using checkpoint value 3750000 for warmup iterations
- > using checkpoint value 600000000 for total number of iterations
- > using checkpoint value cosine for decay style
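These five checkpoint values pin down the learning-rate schedule: linear warmup to 1e-4 over the first 3,750,000 iterations, then cosine decay to the 6e-06 floor at iteration 600,000,000. A minimal sketch of that curve follows (shape only; the run itself uses Megatron's own scheduler, not this snippet):

import math

MAX_LR = 1e-4        # "using checkpoint value 0.0001 for learning rate"
MIN_LR = 6e-6        # "using checkpoint value 6e-06 for minimum learning rate"
WARMUP = 3_750_000   # warmup iterations from the checkpoint
TOTAL = 600_000_000  # total number of iterations from the checkpoint

def lr_at(step):
    # Linear warmup from 0 to MAX_LR over the first WARMUP steps.
    if step < WARMUP:
        return MAX_LR * step / WARMUP
    # Cosine decay from MAX_LR down to MIN_LR, clamped after TOTAL steps.
    progress = min((step - WARMUP) / (TOTAL - WARMUP), 1.0)
    return MIN_LR + 0.5 * (MAX_LR - MIN_LR) * (1 + math.cos(math.pi * progress))

At the iteration this job resumes from (1361, per the checkpoint load below), lr_at(1361) is roughly 3.6e-8, so the run is still at the very start of warmup.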
-successfully loaded 1 ZeRO state_dicts for rank 58
-successfully loaded 1 ZeRO state_dicts for rank 43
-successfully loaded 1 ZeRO state_dicts for rank 37
-successfully loaded 1 ZeRO state_dicts for rank 48
-successfully loaded 1 ZeRO state_dicts for rank 44
-successfully loaded 1 ZeRO state_dicts for rank 56
-successfully loaded 1 ZeRO state_dicts for rank 60
-successfully loaded 1 ZeRO state_dicts for rank 41
-successfully loaded 1 ZeRO state_dicts for rank 40
-successfully loaded 1 ZeRO state_dicts for rank 67
-successfully loaded 1 ZeRO state_dicts for rank 34
-successfully loaded 1 ZeRO state_dicts for rank 96
-successfully loaded 1 ZeRO state_dicts for rank 64
-successfully loaded 1 ZeRO state_dicts for rank 65
-successfully loaded 1 ZeRO state_dicts for rank 51
-successfully loaded 1 ZeRO state_dicts for rank 62
-successfully loaded 1 ZeRO state_dicts for rank 36
-successfully loaded 1 ZeRO state_dicts for rank 32
-successfully loaded 1 ZeRO state_dicts for rank 59
-successfully loaded 1 ZeRO state_dicts for rank 57
-successfully loaded 1 ZeRO state_dicts for rank 42
-successfully loaded 1 ZeRO state_dicts for rank 114
-successfully loaded 1 ZeRO state_dicts for rank 27
-successfully loaded 1 ZeRO state_dicts for rank 102
-successfully loaded 1 ZeRO state_dicts for rank 39
-successfully loaded 1 ZeRO state_dicts for rank 120
-successfully loaded 1 ZeRO state_dicts for rank 49
-successfully loaded 1 ZeRO state_dicts for rank 46
-successfully loaded 1 ZeRO state_dicts for rank 45
-successfully loaded 1 ZeRO state_dicts for rank 47
-successfully loaded 1 ZeRO state_dicts for rank 33
-successfully loaded 1 ZeRO state_dicts for rank 68
-successfully loaded 1 ZeRO state_dicts for rank 35
-successfully loaded 1 ZeRO state_dicts for rank 53
-successfully loaded 1 ZeRO state_dicts for rank 50
-successfully loaded 1 ZeRO state_dicts for rank 100
-successfully loaded 1 ZeRO state_dicts for rank 38
-successfully loaded 1 ZeRO state_dicts for rank 110
-loading 1 zero partition checkpoints for rank 44
-successfully loaded 1 ZeRO state_dicts for rank 66
-successfully loaded 1 ZeRO state_dicts for rank 31
-successfully loaded 1 ZeRO state_dicts for rank 97
-successfully loaded 1 ZeRO state_dicts for rank 20
-successfully loaded 1 ZeRO state_dicts for rank 54
-loading 1 zero partition checkpoints for rank 56
-successfully loaded 1 ZeRO state_dicts for rank 25
-successfully loaded 1 ZeRO state_dicts for rank 24
-successfully loaded 1 ZeRO state_dicts for rank 70
-successfully loaded 1 ZeRO state_dicts for rank 108
-successfully loaded 1 ZeRO state_dicts for rank 111
-successfully loaded 1 ZeRO state_dicts for rank 28
-successfully loaded 1 ZeRO state_dicts for rank 113
-loading 1 zero partition checkpoints for rank 60
-successfully loaded 1 ZeRO state_dicts for rank 61
-successfully loaded 1 ZeRO state_dicts for rank 63
-successfully loaded 1 ZeRO state_dicts for rank 115
-successfully loaded 1 ZeRO state_dicts for rank 9
-successfully loaded 1 ZeRO state_dicts for rank 104
-successfully loaded 1 ZeRO state_dicts for rank 26
-successfully loaded 1 ZeRO state_dicts for rank 86
-successfully loaded 1 ZeRO state_dicts for rank 52
-successfully loaded 1 ZeRO state_dicts for rank 23
-successfully loaded 1 ZeRO state_dicts for rank 11
-successfully loaded 1 ZeRO state_dicts for rank 116
-loading 1 zero partition checkpoints for rank 58
-successfully loaded 1 ZeRO state_dicts for rank 4
-successfully loaded 1 ZeRO state_dicts for rank 55
-successfully loaded 1 ZeRO state_dicts for rank 118
-successfully loaded 1 ZeRO state_dicts for rank 6
-successfully loaded 1 ZeRO state_dicts for rank 22
-successfully loaded 1 ZeRO state_dicts for rank 71
-successfully loaded 1 ZeRO state_dicts for rank 76
-successfully loaded 1 ZeRO state_dicts for rank 107
-successfully loaded 1 ZeRO state_dicts for rank 82
-successfully loaded 1 ZeRO state_dicts for rank 101
-successfully loaded 1 ZeRO state_dicts for rank 13
-successfully loaded 1 ZeRO state_dicts for rank 119
-loading 1 zero partition checkpoints for rank 64
-successfully loaded 1 ZeRO state_dicts for rank 83
-loading 1 zero partition checkpoints for rank 62
-successfully loaded 1 ZeRO state_dicts for rank 29
-successfully loaded 1 ZeRO state_dicts for rank 84
-successfully loaded 1 ZeRO state_dicts for rank 7
-loading 1 zero partition checkpoints for rank 59
-successfully loaded 1 ZeRO state_dicts for rank 5
-successfully loaded 1 ZeRO state_dicts for rank 85
-successfully loaded 1 ZeRO state_dicts for rank 77
-successfully loaded 1 ZeRO state_dicts for rank 98
-successfully loaded 1 ZeRO state_dicts for rank 72
-successfully loaded 1 ZeRO state_dicts for rank 109
-successfully loaded 1 ZeRO state_dicts for rank 112
-loading 1 zero partition checkpoints for rank 51
-loading 1 zero partition checkpoints for rank 41
-loading 1 zero partition checkpoints for rank 37
-successfully loaded 1 ZeRO state_dicts for rank 80
-successfully loaded 1 ZeRO state_dicts for rank 122
-successfully loaded 1 ZeRO state_dicts for rank 87
-loading 1 zero partition checkpoints for rank 48
-successfully loaded 1 ZeRO state_dicts for rank 117
-successfully loaded 1 ZeRO state_dicts for rank 69
-loading 1 zero partition checkpoints for rank 27
-successfully loaded 1 ZeRO state_dicts for rank 103
-successfully loaded 1 ZeRO state_dicts for rank 89
-loading 1 zero partition checkpoints for rank 43
-successfully loaded 1 ZeRO state_dicts for rank 99
-successfully loaded 1 ZeRO state_dicts for rank 10
-successfully loaded 1 ZeRO state_dicts for rank 94
-successfully loaded 1 ZeRO state_dicts for rank 88
-successfully loaded 1 ZeRO state_dicts for rank 92
-successfully loaded 1 ZeRO state_dicts for rank 90
-successfully loaded 1 ZeRO state_dicts for rank 8
-loading 1 zero partition checkpoints for rank 120
-successfully loaded 1 ZeRO state_dicts for rank 95
-loading 1 zero partition checkpoints for rank 114
-loading 1 zero partition checkpoints for rank 42
-successfully loaded 1 ZeRO state_dicts for rank 30
-successfully loaded 1 ZeRO state_dicts for rank 73
-successfully loaded 1 ZeRO state_dicts for rank 93
-loading 1 zero partition checkpoints for rank 68
-successfully loaded 1 ZeRO state_dicts for rank 74
-successfully loaded 1 ZeRO state_dicts for rank 15
-successfully loaded 1 ZeRO state_dicts for rank 21
-loading 1 zero partition checkpoints for rank 47
-successfully loaded 1 ZeRO state_dicts for rank 121
-loading 1 zero partition checkpoints for rank 40
-successfully loaded 1 ZeRO state_dicts for rank 79
-successfully loaded 1 ZeRO state_dicts for rank 91
-successfully loaded 1 ZeRO state_dicts for rank 123
-successfully loaded 1 ZeRO state_dicts for rank 75
-successfully loaded 1 ZeRO state_dicts for rank 12
-loading 1 zero partition checkpoints for rank 67
-successfully loaded 1 ZeRO state_dicts for rank 14
-successfully loaded 1 ZeRO state_dicts for rank 78
-loading 1 zero partition checkpoints for rank 38
-loading 1 zero partition checkpoints for rank 50
-loading 1 zero partition checkpoints for rank 102
-loading 1 zero partition checkpoints for rank 96
-successfully loaded 1 ZeRO state_dicts for rank 106
-loading 1 zero partition checkpoints for rank 35
-loading 1 zero partition checkpoints for rank 34
-loading 1 zero partition checkpoints for rank 97
-loading 1 zero partition checkpoints for rank 65
-loading 1 zero partition checkpoints for rank 66
-loading 1 zero partition checkpoints for rank 25
-loading 1 zero partition checkpoints for rank 70
-loading 1 zero partition checkpoints for rank 31
-loading 1 zero partition checkpoints for rank 36
-loading 1 zero partition checkpoints for rank 33
-loading 1 zero partition checkpoints for rank 57
-loading 1 zero partition checkpoints for rank 32
-successfully loaded 1 ZeRO state_dicts for rank 81
-loading 1 zero partition checkpoints for rank 54
-loading 1 zero partition checkpoints for rank 86
-successfully loaded 1 ZeRO state_dicts for rank 124
-successfully loaded 1 ZeRO state_dicts for rank 125
-loading 1 zero partition checkpoints for rank 9
-loading 1 zero partition checkpoints for rank 76
-loading 1 zero partition checkpoints for rank 111
-loading 1 zero partition checkpoints for rank 39
-loading 1 zero partition checkpoints for rank 49
-loading 1 zero partition checkpoints for rank 101
-loading 1 zero partition checkpoints for rank 46
-loading 1 zero partition checkpoints for rank 45
-loading 1 zero partition checkpoints for rank 52
-successfully loaded 1 ZeRO state_dicts for rank 127
-loading 1 zero partition checkpoints for rank 4
-successfully loaded 1 ZeRO state_dicts for rank 105
-successfully loaded 1 ZeRO state_dicts for rank 1
-loading 1 zero partition checkpoints for rank 23
-loading 1 zero partition checkpoints for rank 116
-loading 1 zero partition checkpoints for rank 80
-loading 1 zero partition checkpoints for rank 53
-loading 1 zero partition checkpoints for rank 112
-loading 1 zero partition checkpoints for rank 83
-loading 1 zero partition checkpoints for rank 100
-successfully loaded 1 ZeRO state_dicts for rank 126
-loading 1 zero partition checkpoints for rank 89
-loading 1 zero partition checkpoints for rank 99
-loading 1 zero partition checkpoints for rank 110
-loading 1 zero partition checkpoints for rank 118
-successfully loaded 1 ZeRO state_dicts for rank 0
-loading 1 zero partition checkpoints for rank 24
-loading 1 zero partition checkpoints for rank 29
-loading 1 zero partition checkpoints for rank 61
-loading 1 zero partition checkpoints for rank 20
-successfully loaded 1 ZeRO state_dicts for rank 3
-loading 1 zero partition checkpoints for rank 63
-loading 1 zero partition checkpoints for rank 108
-loading 1 zero partition checkpoints for rank 8
-loading 1 zero partition checkpoints for rank 28
-loading 1 zero partition checkpoints for rank 90
-loading 1 zero partition checkpoints for rank 73
-loading 1 zero partition checkpoints for rank 84
-loading 1 zero partition checkpoints for rank 5
-loading 1 zero partition checkpoints for rank 21
-loading 1 zero partition checkpoints for rank 113
-loading 1 zero partition checkpoints for rank 115
-loading 1 zero partition checkpoints for rank 26
-loading 1 zero partition checkpoints for rank 109
-loading 1 zero partition checkpoints for rank 104
-loading 1 zero partition checkpoints for rank 92
-loading 1 zero partition checkpoints for rank 123
-successfully loaded 1 ZeRO state_dicts for rank 2
-loading 1 zero partition checkpoints for rank 75
-loading 1 zero partition checkpoints for rank 11
-loading 1 zero partition checkpoints for rank 55
-loading 1 zero partition checkpoints for rank 78
-loading 1 zero partition checkpoints for rank 6
-loading 1 zero partition checkpoints for rank 71
-loading 1 zero partition checkpoints for rank 93
-loading 1 zero partition checkpoints for rank 106
-loading 1 zero partition checkpoints for rank 22
-loading 1 zero partition checkpoints for rank 107
-loading 1 zero partition checkpoints for rank 82
-loading 1 zero partition checkpoints for rank 15
-loading 1 zero partition checkpoints for rank 119
-loading 1 zero partition checkpoints for rank 13
-loading 1 zero partition checkpoints for rank 7
-loading 1 zero partition checkpoints for rank 98
-loading 1 zero partition checkpoints for rank 85
-loading 1 zero partition checkpoints for rank 77
-loading 1 zero partition checkpoints for rank 72
-loading 1 zero partition checkpoints for rank 122
-loading 1 zero partition checkpoints for rank 14
-loading 1 zero partition checkpoints for rank 69
-loading 1 zero partition checkpoints for rank 87
-loading 1 zero partition checkpoints for rank 103
-loading 1 zero partition checkpoints for rank 117
-loading 1 zero partition checkpoints for rank 10
-loading 1 zero partition checkpoints for rank 94
-loading 1 zero partition checkpoints for rank 88
-loading 1 zero partition checkpoints for rank 95
-loading 1 zero partition checkpoints for rank 105
-loading 1 zero partition checkpoints for rank 30
-loading 1 zero partition checkpoints for rank 74
-loading 1 zero partition checkpoints for rank 79
-loading 1 zero partition checkpoints for rank 121
-loading 1 zero partition checkpoints for rank 91
-loading 1 zero partition checkpoints for rank 12
-loading 1 zero partition checkpoints for rank 124
-loading 1 zero partition checkpoints for rank 125
-loading 1 zero partition checkpoints for rank 81
-loading 1 zero partition checkpoints for rank 2
-loading 1 zero partition checkpoints for rank 127
-loading 1 zero partition checkpoints for rank 1
-loading 1 zero partition checkpoints for rank 126
-loading 1 zero partition checkpoints for rank 0
- checkpoint version 3.0
-loading 1 zero partition checkpoints for rank 3
-successfully loaded 1 ZeRO state_dicts for rank 16
-loading 1 zero partition checkpoints for rank 16
-successfully loaded 1 ZeRO state_dicts for rank 17
-loading 1 zero partition checkpoints for rank 17
-successfully loaded 1 ZeRO state_dicts for rank 18
-loading 1 zero partition checkpoints for rank 18
-successfully loaded 1 ZeRO state_dicts for rank 19
-loading 1 zero partition checkpoints for rank 19
- successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 1361
-time (ms) | load-checkpoint: 37075.21
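The 37-second load above restores one ZeRO optimizer state_dict and one zero partition checkpoint per rank, plus the iteration counter. In DeepSpeed terms the resume boils down to a single load_checkpoint call on the engine. The sketch below is illustrative only: the model and config are stand-ins for what the Megatron-DeepSpeed training script actually builds, the kwarg names follow DeepSpeed's public API (which has varied slightly across releases), and it would be run under the deepspeed launcher rather than standalone:

import torch
import deepspeed

model = torch.nn.Linear(8, 8)  # stand-in for the real pipelined GPT model
ds_config = {                  # minimal placeholder, not tr8b-104B's config
    "train_batch_size": 32,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "zero_optimization": {"stage": 1},
}
engine, _, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config)

# Restores the per-rank ZeRO state_dicts and zero partition checkpoints
# logged above, along with whatever client state the caller saved with them.
load_path, client_state = engine.load_checkpoint(
    "/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints",
    load_optimizer_states=True,
    load_lr_scheduler_states=True)
start_iteration = (client_state or {}).get("iteration", 0)  # 1361 in this log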
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
- warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
-estimated model parameters: 125.22432
-estimated model parameters without embeddings: 103.3650944
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count
with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the 
first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.2213504 -estimated model parameters: 125.2213504 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters without embeddings: 103.3650944estimated model parameters: 103.3650944 - - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 
103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.368064 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-26 12:54:28
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 20008960
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.129304 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.303 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.430 seconds
-    total number of samples: 20781483
-    total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.123 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-26 12:54:34
-done with setup ...
-training ...
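The split boundaries above follow Megatron's fractional-split arithmetic. The actual --split argument is not visible in this excerpt, so a 949/50/1 split is an inference from the printed indices; under that assumption, a minimal sketch reproduces them exactly:

    # Sketch of a Megatron-style train/valid/test document split.
    # Assumption: the run used a 949/50/1 split; that flag is not shown in this log.
    size = 304_230_423                      # "number of documents" above
    fracs = [949 / 1000, 50 / 1000, 1 / 1000]

    bounds = [0]
    for f in fracs:
        bounds.append(bounds[-1] + int(round(f * size)))
    # Push the accumulated rounding error into every boundary so the last one
    # lands exactly on `size`.
    diff = bounds[-1] - size
    bounds = [bounds[0]] + [b - diff for b in bounds[1:]]
    print(bounds)  # [0, 288714672, 303926193, 304230423] -- matches the log

The epoch counts obey the same "minimum size" logic: 5 train epochs supply 657686117 samples, which covers the 600000000 requested (4 epochs would fall short), and 3 validation epochs give 20781483 >= 20008960.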
-time (ms) | model-and-optimizer-setup: 42368.84 | train/valid/test-data-iterators-setup: 4824.13
-Number of parameters: 103.3650944 billion
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-[per-rank "Number of parameters" lines deduplicated; the distinct values are kept above]
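The inflated 125.2-billion totals come from the ranks that hold the replicated input/output embeddings, which is exactly the double-counting the UserWarning above flags for PP > 1. The ~103.4B non-embedding figure can be sanity-checked with the usual decoder parameter estimate. The hidden size is not printed in this excerpt, so the value below is an assumption chosen to be consistent with the printed total; the layer count, 64, appears in the activation-checkpointing info just below:

    # Rough sanity check of the non-embedding parameter count.
    # Assumptions: 64 transformer layers (from the checkpointing info below)
    # and hidden size 11600 (not shown in this log excerpt).
    layers, hidden = 64, 11600
    approx = 12 * layers * hidden**2        # QKV + attn projection + 4x MLP weights
    print(approx / 1e9)                     # ~103.34B vs. 103.3650944B logged

The remaining ~0.02% gap is consistent with biases, layernorm weights, and similar small per-layer terms that the 12*l*h^2 rule of thumb ignores.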
-[before the start of training step] datetime: 2021-10-26 12:54:34
-[2021-10-26 12:54:34,570] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-26 12:54:34,570] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-26 12:54:34,570] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-26 12:54:34,570] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-26 12:54:34,571] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
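The per-rank memory block that follows reports PyTorch allocator statistics. A minimal sketch of such a reporter using the standard torch.cuda queries (an illustration, not the project's exact helper):

    # Minimal per-rank CUDA memory report in the style of the lines below.
    import torch
    import torch.distributed as dist

    def report_memory(iteration: int) -> None:
        mb = 1024 * 1024
        rank = dist.get_rank() if dist.is_initialized() else 0
        print(f"[Rank {rank}] (after {iteration} iterations) memory (MB) | "
              f"allocated: {torch.cuda.memory_allocated() / mb} | "
              f"max allocated: {torch.cuda.max_memory_allocated() / mb} | "
              f"reserved: {torch.cuda.memory_reserved() / mb} | "
              f"max reserved: {torch.cuda.max_memory_reserved() / mb}")

In the dump below, ranks 0-3 and 124-127 (the first and last pipeline stages, which also hold the embedding copies per the warning above) allocate ~13.1-13.2 GB, versus ~10.8 GB on the middle stages.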
-[Rank 0] (after 1362 iterations) memory (MB) | allocated: 13206.87109375 | max allocated: 20670.419921875 | reserved: 24440.0 | max reserved: 24440.0
-[Rank 1] (after 1362 iterations) memory (MB) | allocated: 13204.619140625 | max allocated: 20668.80517578125 | reserved: 24440.0 | max reserved: 24440.0
-[Rank 2] (after 1362 iterations) memory (MB) | allocated: 13206.8603515625 | max allocated: 20670.8466796875 | reserved: 24440.0 | max reserved: 24440.0
-[Rank 3] (after 1362 iterations) memory (MB) | allocated: 13206.318359375 | max allocated: 20670.3046875 | reserved: 24440.0 | max reserved: 24440.0
-[Rank 124] (after 1362 iterations) memory (MB) | allocated: 13095.7001953125 | max allocated: 20559.30615234375 | reserved: 24408.0 | max reserved: 24408.0
-[Rank 125] (after 1362 iterations) memory (MB) | allocated: 13096.8115234375 | max allocated: 20560.41748046875 | reserved: 24408.0 | max reserved: 24408.0
-[Rank 126] (after 1362 iterations) memory (MB) | allocated: 13095.7001953125 | max allocated: 20559.30615234375 | reserved: 24408.0 | max reserved: 24408.0
-[Ranks 4-123] (after 1362 iterations) memory (MB) | allocated: 10787.91064453125 | max allocated: 16948.09228515625 | reserved: 20078.0-20108.0 | max reserved: 20078.0-20108.0
-[the 120 middle-stage ranks printed identical allocated/max-allocated figures and are condensed into the line above; reserved memory steps down from 20108.0 (ranks 4-7) to 20078.0 (ranks 120-123)]
- iteration 1362/ 292968 | consumed samples: 2789376 | consumed tokens: 272400384 | elapsed time per iteration (ms): 174548.0 | learning rate: 7.438E-05 | global batch size: 2048 | lm loss: 4.275390E+00 | loss scale: 16384.0 | grad norm: 9502.678 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-[Rank 127] (after 1362 iterations) memory (MB) | allocated: 13096.255859375 | max allocated: 20559.86181640625 | reserved: 24408.0 | max reserved: 24408.0
-time (ms)
- iteration 1363/ 292968 | consumed samples: 2791424 | consumed tokens: 272678912 | elapsed time per iteration (ms): 105516.5 | learning rate: 7.444E-05 | global batch size: 2048 | lm loss: 4.314633E+00 | loss scale: 16384.0 | grad norm: 21415.188 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1364/ 292968 | consumed samples: 2793472 | consumed tokens: 272957440 | elapsed time per iteration (ms): 103696.0 | learning rate: 7.449E-05 | global batch size: 2048 | lm loss: 4.352754E+00 | loss scale: 16384.0 | grad norm: 19744.270 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1365/ 292968 | consumed samples: 2795520 | consumed tokens: 273235968 | elapsed time per iteration (ms): 108156.1 | learning rate: 7.455E-05 | global batch size: 2048 | lm loss: 4.330089E+00 | loss scale: 16384.0 | grad norm: 16622.189 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
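The counters in these iteration lines are internally consistent, which makes a quick integrity check possible when scanning the log: each step consumes one global batch of samples, and with the curriculum sequence length at 136 the token counter advances by 2048 * 136 per step. Pure arithmetic on the logged values:

    # Consistency check on the iteration counters above.
    gbs, seqlen = 2048, 136                 # global batch size, curriculum seqlen
    assert 2791424 - 2789376 == gbs                  # samples, iter 1362 -> 1363
    assert 272678912 - 272400384 == gbs * seqlen     # tokens,  iter 1362 -> 1363
    # Cumulative tokens (272400384) sit well below 2789376 * 136 because the
    # curriculum started from a shorter sequence length and grew to 136.

The learning rate climbs by roughly 5.5e-8 per step over this stretch (7.438E-05 at step 1362 to 7.558E-05 at step 1384), consistent with a linear warmup still in progress.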
2048 | lm loss: 4.330089E+00 | loss scale: 16384.0 | grad norm: 16622.189 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1366/ 292968 | consumed samples: 2797568 | consumed tokens: 273514496 | elapsed time per iteration (ms): 102368.3 | learning rate: 7.460E-05 | global batch size: 2048 | lm loss: 4.377729E+00 | loss scale: 16384.0 | grad norm: 21785.636 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1367/ 292968 | consumed samples: 2799616 | consumed tokens: 273793024 | elapsed time per iteration (ms): 102979.9 | learning rate: 7.466E-05 | global batch size: 2048 | lm loss: 4.329674E+00 | loss scale: 16384.0 | grad norm: 15815.214 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1368/ 292968 | consumed samples: 2801664 | consumed tokens: 274071552 | elapsed time per iteration (ms): 113481.5 | learning rate: 7.471E-05 | global batch size: 2048 | lm loss: 4.336274E+00 | loss scale: 16384.0 | grad norm: 17530.632 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1369/ 292968 | consumed samples: 2803712 | consumed tokens: 274350080 | elapsed time per iteration (ms): 103542.0 | learning rate: 7.477E-05 | global batch size: 2048 | lm loss: 4.293261E+00 | loss scale: 16384.0 | grad norm: 12973.838 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1370/ 292968 | consumed samples: 2805760 | consumed tokens: 274628608 | elapsed time per iteration (ms): 103541.3 | learning rate: 7.482E-05 | global batch size: 2048 | lm loss: 4.273692E+00 | loss scale: 16384.0 | grad norm: 9974.317 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1371/ 292968 | consumed samples: 2807808 | consumed tokens: 274907136 | elapsed time per iteration (ms): 102747.3 | learning rate: 7.487E-05 | global batch size: 2048 | lm loss: 4.269045E+00 | loss scale: 16384.0 | grad norm: 11702.248 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1372/ 292968 | consumed samples: 2809856 | consumed tokens: 275185664 | elapsed time per iteration (ms): 107680.1 | learning rate: 7.493E-05 | global batch size: 2048 | lm loss: 4.288945E+00 | loss scale: 16384.0 | grad norm: 11059.643 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1373/ 292968 | consumed samples: 2811904 | consumed tokens: 275464192 | elapsed time per iteration (ms): 112170.3 | learning rate: 7.498E-05 | global batch size: 2048 | lm loss: 4.258106E+00 | loss scale: 16384.0 | grad norm: 9731.962 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1374/ 292968 | consumed samples: 2813952 | consumed tokens: 275742720 | elapsed time per iteration (ms): 115306.9 | learning rate: 7.504E-05 | global batch size: 2048 | lm loss: 4.231639E+00 | loss scale: 16384.0 | grad norm: 8704.465 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1375/ 292968 | consumed samples: 2816000 | 
consumed tokens: 276021248 | elapsed time per iteration (ms): 109108.9 | learning rate: 7.509E-05 | global batch size: 2048 | lm loss: 4.248688E+00 | loss scale: 16384.0 | grad norm: 8257.479 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1376/ 292968 | consumed samples: 2818048 | consumed tokens: 276299776 | elapsed time per iteration (ms): 103967.5 | learning rate: 7.515E-05 | global batch size: 2048 | lm loss: 4.270372E+00 | loss scale: 16384.0 | grad norm: 7142.424 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1377/ 292968 | consumed samples: 2820096 | consumed tokens: 276578304 | elapsed time per iteration (ms): 104945.2 | learning rate: 7.520E-05 | global batch size: 2048 | lm loss: 4.265023E+00 | loss scale: 16384.0 | grad norm: 7475.966 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1378/ 292968 | consumed samples: 2822144 | consumed tokens: 276856832 | elapsed time per iteration (ms): 107388.6 | learning rate: 7.526E-05 | global batch size: 2048 | lm loss: 4.264834E+00 | loss scale: 16384.0 | grad norm: 6627.965 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1379/ 292968 | consumed samples: 2824192 | consumed tokens: 277135360 | elapsed time per iteration (ms): 112163.2 | learning rate: 7.531E-05 | global batch size: 2048 | lm loss: 4.246704E+00 | loss scale: 16384.0 | grad norm: 8057.255 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1380/ 292968 | consumed samples: 2826240 | consumed tokens: 277413888 | elapsed time per iteration (ms): 105662.3 | learning rate: 7.537E-05 | global batch size: 2048 | lm loss: 4.238889E+00 | loss scale: 16384.0 | grad norm: 6924.976 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1381/ 292968 | consumed samples: 2828288 | consumed tokens: 277692416 | elapsed time per iteration (ms): 102847.2 | learning rate: 7.542E-05 | global batch size: 2048 | lm loss: 4.233212E+00 | loss scale: 16384.0 | grad norm: 7502.716 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1382/ 292968 | consumed samples: 2830336 | consumed tokens: 277970944 | elapsed time per iteration (ms): 101894.9 | learning rate: 7.548E-05 | global batch size: 2048 | lm loss: 4.240797E+00 | loss scale: 16384.0 | grad norm: 9269.516 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1383/ 292968 | consumed samples: 2832384 | consumed tokens: 278249472 | elapsed time per iteration (ms): 102190.8 | learning rate: 7.553E-05 | global batch size: 2048 | lm loss: 4.222159E+00 | loss scale: 16384.0 | grad norm: 9096.782 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1384/ 292968 | consumed samples: 2834432 | consumed tokens: 278528000 | elapsed time per iteration (ms): 103914.1 | learning rate: 7.558E-05 | global batch size: 2048 | lm loss: 4.232150E+00 | loss scale: 16384.0 | grad norm: 10928.069 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1385/ 292968 | consumed samples: 2836480 | consumed tokens: 278806528 | elapsed time per iteration (ms): 104856.0 | learning rate: 7.564E-05 | global batch size: 2048 | lm loss: 4.232363E+00 | loss scale: 16384.0 | grad norm: 12806.746 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1386/ 292968 | consumed samples: 2838528 | consumed tokens: 279085056 | elapsed time per iteration (ms): 111753.8 | learning rate: 7.569E-05 | global batch size: 2048 | lm loss: 4.272614E+00 | loss scale: 16384.0 | grad norm: 16420.638 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1387/ 292968 | consumed samples: 2840576 | consumed tokens: 279363584 | elapsed time per iteration (ms): 109025.1 | learning rate: 7.575E-05 | global batch size: 2048 | lm loss: 4.226984E+00 | loss scale: 16384.0 | grad norm: 15008.946 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1388/ 292968 | consumed samples: 2842624 | consumed tokens: 279642112 | elapsed time per iteration (ms): 105766.2 | learning rate: 7.580E-05 | global batch size: 2048 | lm loss: 4.233119E+00 | loss scale: 16384.0 | grad norm: 9467.063 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1389/ 292968 | consumed samples: 2844672 | consumed tokens: 279920640 | elapsed time per iteration (ms): 103190.1 | learning rate: 7.586E-05 | global batch size: 2048 | lm loss: 4.241223E+00 | loss scale: 16384.0 | grad norm: 8154.032 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1390/ 292968 | consumed samples: 2846720 | consumed tokens: 280199168 | elapsed time per iteration (ms): 104319.0 | learning rate: 7.591E-05 | global batch size: 2048 | lm loss: 4.235702E+00 | loss scale: 16384.0 | grad norm: 8481.463 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1391/ 292968 | consumed samples: 2848768 | consumed tokens: 280477696 | elapsed time per iteration (ms): 105334.3 | learning rate: 7.597E-05 | global batch size: 2048 | lm loss: 4.234316E+00 | loss scale: 16384.0 | grad norm: 9273.574 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1392/ 292968 | consumed samples: 2850816 | consumed tokens: 280756224 | elapsed time per iteration (ms): 102216.0 | learning rate: 7.602E-05 | global batch size: 2048 | lm loss: 4.202343E+00 | loss scale: 16384.0 | grad norm: 9477.995 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1393/ 292968 | consumed samples: 2852864 | consumed tokens: 281034752 | elapsed time per iteration (ms): 103636.3 | learning rate: 7.608E-05 | global batch size: 2048 | lm loss: 4.220612E+00 | loss scale: 16384.0 | grad norm: 8610.856 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1394/ 292968 | consumed samples: 2854912 | consumed tokens: 281313280 | elapsed time per iteration (ms): 106530.3 | learning rate: 7.613E-05 | global batch size: 2048 | lm loss: 
4.231889E+00 | loss scale: 16384.0 | grad norm: 8917.931 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1395/ 292968 | consumed samples: 2856960 | consumed tokens: 281591808 | elapsed time per iteration (ms): 107497.4 | learning rate: 7.619E-05 | global batch size: 2048 | lm loss: 4.239625E+00 | loss scale: 16384.0 | grad norm: 8666.273 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1396/ 292968 | consumed samples: 2859008 | consumed tokens: 281870336 | elapsed time per iteration (ms): 106305.6 | learning rate: 7.624E-05 | global batch size: 2048 | lm loss: 4.202125E+00 | loss scale: 16384.0 | grad norm: 8305.040 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1397/ 292968 | consumed samples: 2861056 | consumed tokens: 282148864 | elapsed time per iteration (ms): 112297.7 | learning rate: 7.629E-05 | global batch size: 2048 | lm loss: 4.228557E+00 | loss scale: 16384.0 | grad norm: 7620.646 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1398/ 292968 | consumed samples: 2863104 | consumed tokens: 282427392 | elapsed time per iteration (ms): 104652.8 | learning rate: 7.635E-05 | global batch size: 2048 | lm loss: 4.222525E+00 | loss scale: 16384.0 | grad norm: 7865.839 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1399/ 292968 | consumed samples: 2865152 | consumed tokens: 282705920 | elapsed time per iteration (ms): 108403.9 | learning rate: 7.640E-05 | global batch size: 2048 | lm loss: 4.248534E+00 | loss scale: 16384.0 | grad norm: 8005.858 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1400/ 292968 | consumed samples: 2867200 | consumed tokens: 282984448 | elapsed time per iteration (ms): 105735.3 | learning rate: 7.646E-05 | global batch size: 2048 | lm loss: 4.238834E+00 | loss scale: 16384.0 | grad norm: 7538.445 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1401/ 292968 | consumed samples: 2869248 | consumed tokens: 283262976 | elapsed time per iteration (ms): 103572.8 | learning rate: 7.651E-05 | global batch size: 2048 | lm loss: 4.235280E+00 | loss scale: 16384.0 | grad norm: 6832.742 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1402/ 292968 | consumed samples: 2871296 | consumed tokens: 283541504 | elapsed time per iteration (ms): 106021.4 | learning rate: 7.657E-05 | global batch size: 2048 | lm loss: 4.212063E+00 | loss scale: 16384.0 | grad norm: 8353.157 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1403/ 292968 | consumed samples: 2873344 | consumed tokens: 283820032 | elapsed time per iteration (ms): 109954.2 | learning rate: 7.662E-05 | global batch size: 2048 | lm loss: 4.218483E+00 | loss scale: 16384.0 | grad norm: 11841.354 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1404/ 292968 | consumed samples: 2875392 | consumed tokens: 284098560 | 
elapsed time per iteration (ms): 106802.7 | learning rate: 7.668E-05 | global batch size: 2048 | lm loss: 4.222077E+00 | loss scale: 16384.0 | grad norm: 13820.592 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1405/ 292968 | consumed samples: 2877440 | consumed tokens: 284377088 | elapsed time per iteration (ms): 101712.3 | learning rate: 7.673E-05 | global batch size: 2048 | lm loss: 4.246716E+00 | loss scale: 16384.0 | grad norm: 14468.150 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1406/ 292968 | consumed samples: 2879488 | consumed tokens: 284655616 | elapsed time per iteration (ms): 105224.2 | learning rate: 7.679E-05 | global batch size: 2048 | lm loss: 4.234392E+00 | loss scale: 16384.0 | grad norm: 12753.276 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1407/ 292968 | consumed samples: 2881536 | consumed tokens: 284934144 | elapsed time per iteration (ms): 109099.7 | learning rate: 7.684E-05 | global batch size: 2048 | lm loss: 4.240631E+00 | loss scale: 16384.0 | grad norm: 12146.871 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1408/ 292968 | consumed samples: 2883584 | consumed tokens: 285212672 | elapsed time per iteration (ms): 119039.5 | learning rate: 7.690E-05 | global batch size: 2048 | lm loss: 4.243193E+00 | loss scale: 16384.0 | grad norm: 12934.468 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1409/ 292968 | consumed samples: 2885632 | consumed tokens: 285491200 | elapsed time per iteration (ms): 105712.3 | learning rate: 7.695E-05 | global batch size: 2048 | lm loss: 4.245343E+00 | loss scale: 16384.0 | grad norm: 8613.445 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1410/ 292968 | consumed samples: 2887680 | consumed tokens: 285769728 | elapsed time per iteration (ms): 106251.1 | learning rate: 7.700E-05 | global batch size: 2048 | lm loss: 4.244947E+00 | loss scale: 16384.0 | grad norm: 8520.048 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1411/ 292968 | consumed samples: 2889728 | consumed tokens: 286048256 | elapsed time per iteration (ms): 108902.5 | learning rate: 7.706E-05 | global batch size: 2048 | lm loss: 4.254898E+00 | loss scale: 16384.0 | grad norm: 11526.049 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1412/ 292968 | consumed samples: 2891776 | consumed tokens: 286326784 | elapsed time per iteration (ms): 103411.7 | learning rate: 7.711E-05 | global batch size: 2048 | lm loss: 4.250681E+00 | loss scale: 16384.0 | grad norm: 15713.264 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1413/ 292968 | consumed samples: 2893824 | consumed tokens: 286605312 | elapsed time per iteration (ms): 103426.1 | learning rate: 7.717E-05 | global batch size: 2048 | lm loss: 4.250299E+00 | loss scale: 16384.0 | grad norm: 15564.952 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of 
nan iterations: 0 | -time (ms) - iteration 1414/ 292968 | consumed samples: 2895872 | consumed tokens: 286883840 | elapsed time per iteration (ms): 109896.7 | learning rate: 7.722E-05 | global batch size: 2048 | lm loss: 4.217804E+00 | loss scale: 16384.0 | grad norm: 10914.826 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1415/ 292968 | consumed samples: 2897920 | consumed tokens: 287162368 | elapsed time per iteration (ms): 107058.1 | learning rate: 7.728E-05 | global batch size: 2048 | lm loss: 4.260148E+00 | loss scale: 16384.0 | grad norm: 11263.252 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1416/ 292968 | consumed samples: 2899968 | consumed tokens: 287440896 | elapsed time per iteration (ms): 113912.7 | learning rate: 7.733E-05 | global batch size: 2048 | lm loss: 4.242663E+00 | loss scale: 16384.0 | grad norm: 7779.069 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1417/ 292968 | consumed samples: 2902016 | consumed tokens: 287719424 | elapsed time per iteration (ms): 113942.4 | learning rate: 7.739E-05 | global batch size: 2048 | lm loss: 4.220640E+00 | loss scale: 16384.0 | grad norm: 10008.599 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1418/ 292968 | consumed samples: 2904064 | consumed tokens: 287997952 | elapsed time per iteration (ms): 106464.4 | learning rate: 7.744E-05 | global batch size: 2048 | lm loss: 4.230143E+00 | loss scale: 16384.0 | grad norm: 10022.388 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1419/ 292968 | consumed samples: 2906112 | consumed tokens: 288276480 | elapsed time per iteration (ms): 104883.9 | learning rate: 7.750E-05 | global batch size: 2048 | lm loss: 4.203662E+00 | loss scale: 16384.0 | grad norm: 8534.648 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1420/ 292968 | consumed samples: 2908160 | consumed tokens: 288555008 | elapsed time per iteration (ms): 112239.4 | learning rate: 7.755E-05 | global batch size: 2048 | lm loss: 4.213041E+00 | loss scale: 16384.0 | grad norm: 9035.971 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1421/ 292968 | consumed samples: 2910208 | consumed tokens: 288833536 | elapsed time per iteration (ms): 107070.5 | learning rate: 7.761E-05 | global batch size: 2048 | lm loss: 4.219098E+00 | loss scale: 16384.0 | grad norm: 9457.717 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1422/ 292968 | consumed samples: 2912256 | consumed tokens: 289112064 | elapsed time per iteration (ms): 105043.8 | learning rate: 7.766E-05 | global batch size: 2048 | lm loss: 4.228152E+00 | loss scale: 16384.0 | grad norm: 10640.947 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1423/ 292968 | consumed samples: 2914304 | consumed tokens: 289390592 | elapsed time per iteration (ms): 106210.8 | learning rate: 7.771E-05 | global batch size: 2048 | lm loss: 4.229786E+00 | loss scale: 16384.0 
| grad norm: 13324.071 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1424/ 292968 | consumed samples: 2916352 | consumed tokens: 289669120 | elapsed time per iteration (ms): 103395.1 | learning rate: 7.777E-05 | global batch size: 2048 | lm loss: 4.237543E+00 | loss scale: 16384.0 | grad norm: 13860.585 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1425/ 292968 | consumed samples: 2918400 | consumed tokens: 289947648 | elapsed time per iteration (ms): 109403.7 | learning rate: 7.782E-05 | global batch size: 2048 | lm loss: 4.246883E+00 | loss scale: 16384.0 | grad norm: 16031.358 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1426/ 292968 | consumed samples: 2920448 | consumed tokens: 290226176 | elapsed time per iteration (ms): 107261.3 | learning rate: 7.788E-05 | global batch size: 2048 | lm loss: 4.244311E+00 | loss scale: 16384.0 | grad norm: 13853.196 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1427/ 292968 | consumed samples: 2922496 | consumed tokens: 290504704 | elapsed time per iteration (ms): 101914.0 | learning rate: 7.793E-05 | global batch size: 2048 | lm loss: 4.241423E+00 | loss scale: 16384.0 | grad norm: 8120.449 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1428/ 292968 | consumed samples: 2924544 | consumed tokens: 290783232 | elapsed time per iteration (ms): 105924.6 | learning rate: 7.799E-05 | global batch size: 2048 | lm loss: 4.251287E+00 | loss scale: 16384.0 | grad norm: 11225.130 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1429/ 292968 | consumed samples: 2926592 | consumed tokens: 291061760 | elapsed time per iteration (ms): 111625.2 | learning rate: 7.804E-05 | global batch size: 2048 | lm loss: 4.221348E+00 | loss scale: 16384.0 | grad norm: 8955.910 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1430/ 292968 | consumed samples: 2928640 | consumed tokens: 291340288 | elapsed time per iteration (ms): 110528.6 | learning rate: 7.810E-05 | global batch size: 2048 | lm loss: 4.237571E+00 | loss scale: 16384.0 | grad norm: 9021.480 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1431/ 292968 | consumed samples: 2930688 | consumed tokens: 291618816 | elapsed time per iteration (ms): 121116.2 | learning rate: 7.815E-05 | global batch size: 2048 | lm loss: 4.236102E+00 | loss scale: 16384.0 | grad norm: 9625.011 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1432/ 292968 | consumed samples: 2932736 | consumed tokens: 291897344 | elapsed time per iteration (ms): 117667.6 | learning rate: 7.821E-05 | global batch size: 2048 | lm loss: 4.230381E+00 | loss scale: 16384.0 | grad norm: 10906.151 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1433/ 292968 | consumed samples: 2934784 | consumed tokens: 292175872 | elapsed time per iteration 
(ms): 106766.2 | learning rate: 7.826E-05 | global batch size: 2048 | lm loss: 4.214566E+00 | loss scale: 16384.0 | grad norm: 10475.901 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1434/ 292968 | consumed samples: 2936832 | consumed tokens: 292454400 | elapsed time per iteration (ms): 105367.7 | learning rate: 7.832E-05 | global batch size: 2048 | lm loss: 4.215159E+00 | loss scale: 16384.0 | grad norm: 8902.812 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1435/ 292968 | consumed samples: 2938880 | consumed tokens: 292732928 | elapsed time per iteration (ms): 104596.9 | learning rate: 7.837E-05 | global batch size: 2048 | lm loss: 4.201122E+00 | loss scale: 16384.0 | grad norm: 11236.120 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1436/ 292968 | consumed samples: 2940928 | consumed tokens: 293011456 | elapsed time per iteration (ms): 103796.8 | learning rate: 7.842E-05 | global batch size: 2048 | lm loss: 4.241621E+00 | loss scale: 16384.0 | grad norm: 13822.170 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1437/ 292968 | consumed samples: 2942976 | consumed tokens: 293289984 | elapsed time per iteration (ms): 109517.1 | learning rate: 7.848E-05 | global batch size: 2048 | lm loss: 4.209274E+00 | loss scale: 16384.0 | grad norm: 12381.487 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1438/ 292968 | consumed samples: 2945024 | consumed tokens: 293568512 | elapsed time per iteration (ms): 110411.6 | learning rate: 7.853E-05 | global batch size: 2048 | lm loss: 4.182668E+00 | loss scale: 16384.0 | grad norm: 8461.110 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1439/ 292968 | consumed samples: 2947072 | consumed tokens: 293847040 | elapsed time per iteration (ms): 125011.3 | learning rate: 7.859E-05 | global batch size: 2048 | lm loss: 4.253170E+00 | loss scale: 16384.0 | grad norm: 8044.987 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1440/ 292968 | consumed samples: 2949120 | consumed tokens: 294125568 | elapsed time per iteration (ms): 127117.2 | learning rate: 7.864E-05 | global batch size: 2048 | lm loss: 4.202640E+00 | loss scale: 16384.0 | grad norm: 8995.265 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1441/ 292968 | consumed samples: 2951168 | consumed tokens: 294404096 | elapsed time per iteration (ms): 128961.3 | learning rate: 7.870E-05 | global batch size: 2048 | lm loss: 4.202611E+00 | loss scale: 16384.0 | grad norm: 11990.677 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1442/ 292968 | consumed samples: 2953216 | consumed tokens: 294682624 | elapsed time per iteration (ms): 129502.1 | learning rate: 7.875E-05 | global batch size: 2048 | lm loss: 4.185284E+00 | loss scale: 16384.0 | grad norm: 10781.228 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) 
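The iteration records above are self-consistent: tokens per iteration = global batch size x curriculum seqlen (2048 x 136 = 278528), which matches the consumed-tokens deltas between consecutive records, and dividing by the elapsed time gives the job-wide throughput. A minimal parsing sketch in Python, assuming only the field layout visible above (the regex and the parse_iteration helper are illustrative, not taken from the training code):

    # Illustrative only: parses one "iteration N/ M | ..." record as logged above.
    import re

    ITER_RE = re.compile(
        r"iteration\s+(?P<step>\d+)/\s*\d+ \|"
        r" consumed samples: (?P<samples>\d+) \|"
        r" consumed tokens: (?P<tokens>\d+) \|"
        r" elapsed time per iteration \(ms\): (?P<ms>[\d.]+) \|"
        r" learning rate: (?P<lr>[\dE.+-]+) \|"
        r" global batch size: (?P<gbs>\d+) \|"
        r" lm loss: (?P<loss>[\dE.+-]+) \|"
    )

    def parse_iteration(line: str) -> dict:
        """Extract the numeric fields of one iteration record."""
        m = ITER_RE.search(line)
        if m is None:
            raise ValueError("not an iteration record")
        d = m.groupdict()
        return {k: float(v) if k in ("ms", "lr", "loss") else int(v)
                for k, v in d.items()}

    line = ("iteration 1433/ 292968 | consumed samples: 2934784 | "
            "consumed tokens: 292175872 | elapsed time per iteration (ms): 106766.2 | "
            "learning rate: 7.826E-05 | global batch size: 2048 | lm loss: 4.214566E+00 |")
    rec = parse_iteration(line)
    # tokens per iteration = 2048 * 136 = 278528, matching the consumed-tokens deltas;
    # throughput follows directly from the elapsed time.
    print(278528 / (rec["ms"] / 1000))  # ~2609 tokens/sec for the whole job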
- iteration 1443/ 292968 | consumed samples: 2955264 | consumed tokens: 294961152 | elapsed time per iteration (ms): 119621.2 | learning rate: 7.881E-05 | global batch size: 2048 | lm loss: 4.212717E+00 | loss scale: 16384.0 | grad norm: 10992.166 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1444/ 292968 | consumed samples: 2957312 | consumed tokens: 295239680 | elapsed time per iteration (ms): 112601.2 | learning rate: 7.886E-05 | global batch size: 2048 | lm loss: 4.218211E+00 | loss scale: 16384.0 | grad norm: 11677.358 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1445/ 292968 | consumed samples: 2959360 | consumed tokens: 295518208 | elapsed time per iteration (ms): 104218.5 | learning rate: 7.892E-05 | global batch size: 2048 | lm loss: 4.196202E+00 | loss scale: 16384.0 | grad norm: 9834.030 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1446/ 292968 | consumed samples: 2961408 | consumed tokens: 295796736 | elapsed time per iteration (ms): 102655.0 | learning rate: 7.897E-05 | global batch size: 2048 | lm loss: 4.240595E+00 | loss scale: 16384.0 | grad norm: 11387.269 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1447/ 292968 | consumed samples: 2963456 | consumed tokens: 296075264 | elapsed time per iteration (ms): 102643.1 | learning rate: 7.903E-05 | global batch size: 2048 | lm loss: 4.212106E+00 | loss scale: 16384.0 | grad norm: 12999.487 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1448/ 292968 | consumed samples: 2965504 | consumed tokens: 296353792 | elapsed time per iteration (ms): 106364.7 | learning rate: 7.908E-05 | global batch size: 2048 | lm loss: 4.240885E+00 | loss scale: 16384.0 | grad norm: 10126.788 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1449/ 292968 | consumed samples: 2967552 | consumed tokens: 296632320 | elapsed time per iteration (ms): 102111.7 | learning rate: 7.913E-05 | global batch size: 2048 | lm loss: 4.204792E+00 | loss scale: 16384.0 | grad norm: 12347.191 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1450/ 292968 | consumed samples: 2969600 | consumed tokens: 296910848 | elapsed time per iteration (ms): 105589.5 | learning rate: 7.919E-05 | global batch size: 2048 | lm loss: 4.226323E+00 | loss scale: 16384.0 | grad norm: 14068.807 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1451/ 292968 | consumed samples: 2971648 | consumed tokens: 297189376 | elapsed time per iteration (ms): 111264.2 | learning rate: 7.924E-05 | global batch size: 2048 | lm loss: 4.218484E+00 | loss scale: 16384.0 | grad norm: 13129.940 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1452/ 292968 | consumed samples: 2973696 | consumed tokens: 297484288 | elapsed time per iteration (ms): 104113.1 | learning rate: 7.930E-05 | global batch size: 2048 | lm loss: 4.281313E+00 | loss scale: 16384.0 | grad norm: 11883.531 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1453/ 292968 | consumed samples: 2975744 | consumed tokens: 297779200 | elapsed time per iteration (ms): 107644.0 | learning rate: 7.935E-05 | global batch size: 2048 | lm loss: 4.251756E+00 | loss scale: 16384.0 | grad norm: 8827.322 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1454/ 292968 | consumed samples: 2977792 | consumed tokens: 298074112 | elapsed time per iteration (ms): 122327.8 | learning rate: 7.941E-05 | global batch size: 2048 | lm loss: 4.221422E+00 | loss scale: 16384.0 | grad norm: 12049.531 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1455/ 292968 | consumed samples: 2979840 | consumed tokens: 298369024 | elapsed time per iteration (ms): 111175.4 | learning rate: 7.946E-05 | global batch size: 2048 | lm loss: 4.253358E+00 | loss scale: 16384.0 | grad norm: 12089.257 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1456/ 292968 | consumed samples: 2981888 | consumed tokens: 298663936 | elapsed time per iteration (ms): 105169.2 | learning rate: 7.952E-05 | global batch size: 2048 | lm loss: 4.233932E+00 | loss scale: 16384.0 | grad norm: 18834.042 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1457/ 292968 | consumed samples: 2983936 | consumed tokens: 298958848 | elapsed time per iteration (ms): 104578.7 | learning rate: 7.957E-05 | global batch size: 2048 | lm loss: 4.245527E+00 | loss scale: 16384.0 | grad norm: 13825.694 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1458/ 292968 | consumed samples: 2985984 | consumed tokens: 299253760 | elapsed time per iteration (ms): 103232.8 | learning rate: 7.963E-05 | global batch size: 2048 | lm loss: 4.232552E+00 | loss scale: 16384.0 | grad norm: 11527.202 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1459/ 292968 | consumed samples: 2988032 | consumed tokens: 299548672 | elapsed time per iteration (ms): 102624.9 | learning rate: 7.968E-05 | global batch size: 2048 | lm loss: 4.230423E+00 | loss scale: 16384.0 | grad norm: 12961.825 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1460/ 292968 | consumed samples: 2990080 | consumed tokens: 299843584 | elapsed time per iteration (ms): 103095.6 | learning rate: 7.974E-05 | global batch size: 2048 | lm loss: 4.201604E+00 | loss scale: 16384.0 | grad norm: 11652.164 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1461/ 292968 | consumed samples: 2992128 | consumed tokens: 300138496 | elapsed time per iteration (ms): 105345.6 | learning rate: 7.979E-05 | global batch size: 2048 | lm loss: 4.233181E+00 | loss scale: 16384.0 | grad norm: 9931.745 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1462/ 292968 | consumed samples: 2994176 | consumed tokens: 300433408 | elapsed time per iteration (ms): 103873.7 | learning rate: 7.984E-05 | global batch size: 2048 | lm loss: 4.217042E+00 | loss scale: 16384.0 | grad norm: 9227.605 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1463/ 292968 | consumed samples: 2996224 | consumed tokens: 300728320 | elapsed time per iteration (ms): 103836.5 | learning rate: 7.990E-05 | global batch size: 2048 | lm loss: 4.188097E+00 | loss scale: 16384.0 | grad norm: 12528.586 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1464/ 292968 | consumed samples: 2998272 | consumed tokens: 301023232 | elapsed time per iteration (ms): 109032.4 | learning rate: 7.995E-05 | global batch size: 2048 | lm loss: 4.216120E+00 | loss scale: 16384.0 | grad norm: 12769.241 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1465/ 292968 | consumed samples: 3000320 | consumed tokens: 301318144 | elapsed time per iteration (ms): 104571.0 | learning rate: 8.001E-05 | global batch size: 2048 | lm loss: 4.192651E+00 | loss scale: 16384.0 | grad norm: 11561.615 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1466/ 292968 | consumed samples: 3002368 | consumed tokens: 301613056 | elapsed time per iteration (ms): 103381.4 | learning rate: 8.006E-05 | global batch size: 2048 | lm loss: 4.198256E+00 | loss scale: 16384.0 | grad norm: 9145.952 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1467/ 292968 | consumed samples: 3004416 | consumed tokens: 301907968 | elapsed time per iteration (ms): 106062.2 | learning rate: 8.012E-05 | global batch size: 2048 | lm loss: 4.216657E+00 | loss scale: 16384.0 | grad norm: 8140.649 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1468/ 292968 | consumed samples: 3006464 | consumed tokens: 302202880 | elapsed time per iteration (ms): 104292.7 | learning rate: 8.017E-05 | global batch size: 2048 | lm loss: 4.228948E+00 | loss scale: 16384.0 | grad norm: 8868.143 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1469/ 292968 | consumed samples: 3008512 | consumed tokens: 302497792 | elapsed time per iteration (ms): 109335.2 | learning rate: 8.023E-05 | global batch size: 2048 | lm loss: 4.176727E+00 | loss scale: 16384.0 | grad norm: 9614.273 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1470/ 292968 | consumed samples: 3010560 | consumed tokens: 302792704 | elapsed time per iteration (ms): 104543.7 | learning rate: 8.028E-05 | global batch size: 2048 | lm loss: 4.166099E+00 | loss scale: 16384.0 | grad norm: 10269.428 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1471/ 292968 | consumed samples: 3012608 | consumed tokens: 303087616 | elapsed time per iteration (ms): 105535.9 | learning rate: 8.034E-05 | global batch size: 2048 | lm loss: 4.207515E+00 | loss scale: 16384.0 | grad norm: 11332.054 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1472/ 292968 | consumed samples: 3014656 | consumed tokens: 303382528 | elapsed time per iteration (ms): 105432.4 | learning rate: 8.039E-05 | global batch size: 2048 | lm loss: 4.219175E+00 | loss scale: 16384.0 | grad norm: 10612.644 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1473/ 292968 | consumed samples: 3016704 | consumed tokens: 303677440 | elapsed time per iteration (ms): 108587.6 | learning rate: 8.045E-05 | global batch size: 2048 | lm loss: 4.212423E+00 | loss scale: 16384.0 | grad norm: 9364.871 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1474/ 292968 | consumed samples: 3018752 | consumed tokens: 303972352 | elapsed time per iteration (ms): 115903.3 | learning rate: 8.050E-05 | global batch size: 2048 | lm loss: 4.184131E+00 | loss scale: 16384.0 | grad norm: 9388.093 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1475/ 292968 | consumed samples: 3020800 | consumed tokens: 304267264 | elapsed time per iteration (ms): 111248.7 | learning rate: 8.055E-05 | global batch size: 2048 | lm loss: 4.197936E+00 | loss scale: 16384.0 | grad norm: 11204.270 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1476/ 292968 | consumed samples: 3022848 | consumed tokens: 304562176 | elapsed time per iteration (ms): 106196.1 | learning rate: 8.061E-05 | global batch size: 2048 | lm loss: 4.200994E+00 | loss scale: 16384.0 | grad norm: 12460.238 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1477/ 292968 | consumed samples: 3024896 | consumed tokens: 304857088 | elapsed time per iteration (ms): 115245.1 | learning rate: 8.066E-05 | global batch size: 2048 | lm loss: 4.185134E+00 | loss scale: 16384.0 | grad norm: 13631.835 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1478/ 292968 | consumed samples: 3026944 | consumed tokens: 305152000 | elapsed time per iteration (ms): 104367.4 | learning rate: 8.072E-05 | global batch size: 2048 | lm loss: 4.216756E+00 | loss scale: 16384.0 | grad norm: 12075.381 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1479/ 292968 | consumed samples: 3028992 | consumed tokens: 305446912 | elapsed time per iteration (ms): 106265.4 | learning rate: 8.077E-05 | global batch size: 2048 | lm loss: 4.171759E+00 | loss scale: 16384.0 | grad norm: 10980.912 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1480/ 292968 | consumed samples: 3031040 | consumed tokens: 305741824 | elapsed time per iteration (ms): 108805.3 | learning rate: 8.083E-05 | global batch size: 2048 | lm loss: 4.197142E+00 | loss scale: 16384.0 | grad norm: 11320.773 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1481/ 292968 | consumed samples: 3033088 | consumed tokens: 306036736 | elapsed time per iteration (ms): 105268.5 | learning rate: 8.088E-05 | global batch size: 2048 | lm loss: 4.194962E+00 | loss scale: 16384.0 | grad norm: 9121.136 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1482/ 292968 | consumed samples: 3035136 | consumed tokens: 306331648 | elapsed time per iteration (ms): 104524.1 | learning rate: 8.094E-05 | global batch size: 2048 | lm loss: 4.179837E+00 | loss scale: 16384.0 | grad norm: 8314.868 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1483/ 292968 | consumed samples: 3037184 | consumed tokens: 306626560 | elapsed time per iteration (ms): 105954.6 | learning rate: 8.099E-05 | global batch size: 2048 | lm loss: 4.156200E+00 | loss scale: 16384.0 | grad norm: 8117.374 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1484/ 292968 | consumed samples: 3039232 | consumed tokens: 306921472 | elapsed time per iteration (ms): 107617.6 | learning rate: 8.105E-05 | global batch size: 2048 | lm loss: 4.172272E+00 | loss scale: 16384.0 | grad norm: 7959.362 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1485/ 292968 | consumed samples: 3041280 | consumed tokens: 307216384 | elapsed time per iteration (ms): 110844.3 | learning rate: 8.110E-05 | global batch size: 2048 | lm loss: 4.182116E+00 | loss scale: 16384.0 | grad norm: 9225.480 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1486/ 292968 | consumed samples: 3043328 | consumed tokens: 307511296 | elapsed time per iteration (ms): 110773.6 | learning rate: 8.116E-05 | global batch size: 2048 | lm loss: 4.163764E+00 | loss scale: 16384.0 | grad norm: 11008.014 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1487/ 292968 | consumed samples: 3045376 | consumed tokens: 307806208 | elapsed time per iteration (ms): 110039.9 | learning rate: 8.121E-05 | global batch size: 2048 | lm loss: 4.195785E+00 | loss scale: 16384.0 | grad norm: 14146.053 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1488/ 292968 | consumed samples: 3047424 | consumed tokens: 308101120 | elapsed time per iteration (ms): 103981.1 | learning rate: 8.126E-05 | global batch size: 2048 | lm loss: 4.178236E+00 | loss scale: 16384.0 | grad norm: 12376.963 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1489/ 292968 | consumed samples: 3049472 | consumed tokens: 308396032 | elapsed time per iteration (ms): 103792.3 | learning rate: 8.132E-05 | global batch size: 2048 | lm loss: 4.184059E+00 | loss scale: 16384.0 | grad norm: 11650.633 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1490/ 292968 | consumed samples: 3051520 | consumed tokens: 308690944 | elapsed time per iteration (ms): 104164.7 | learning rate: 8.137E-05 | global batch size: 2048 | lm loss: 4.152838E+00 | loss scale: 16384.0 | grad norm: 10855.557 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1491/ 292968 | consumed samples: 3053568 | consumed tokens: 308985856 | elapsed time per iteration (ms): 104207.6 | learning rate: 8.143E-05 | global batch size: 2048 | lm loss: 4.177531E+00 | loss scale: 16384.0 | grad norm: 8624.545 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1492/ 292968 | consumed samples: 3055616 | consumed tokens: 309280768 | elapsed time per iteration (ms): 107790.0 | learning rate: 8.148E-05 | global batch size: 2048 | lm loss: 4.192419E+00 | loss scale: 16384.0 | grad norm: 11069.290 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1493/ 292968 | consumed samples: 3057664 | consumed tokens: 309575680 | elapsed time per iteration (ms): 103519.8 | learning rate: 8.154E-05 | global batch size: 2048 | lm loss: 4.171244E+00 | loss scale: 16384.0 | grad norm: 12777.128 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1494/ 292968 | consumed samples: 3059712 | consumed tokens: 309870592 | elapsed time per iteration (ms): 102492.4 | learning rate: 8.159E-05 | global batch size: 2048 | lm loss: 4.150554E+00 | loss scale: 16384.0 | grad norm: 13849.403 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1495/ 292968 | consumed samples: 3061760 | consumed tokens: 310165504 | elapsed time per iteration (ms): 104553.4 | learning rate: 8.165E-05 | global batch size: 2048 | lm loss: 4.210358E+00 | loss scale: 16384.0 | grad norm: 14309.631 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1496/ 292968 | consumed samples: 3063808 | consumed tokens: 310460416 | elapsed time per iteration (ms): 106258.9 | learning rate: 8.170E-05 | global batch size: 2048 | lm loss: 4.154563E+00 | loss scale: 16384.0 | grad norm: 10492.725 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1497/ 292968 | consumed samples: 3065856 | consumed tokens: 310755328 | elapsed time per iteration (ms): 107462.9 | learning rate: 8.176E-05 | global batch size: 2048 | lm loss: 4.204808E+00 | loss scale: 16384.0 | grad norm: 7189.207 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1498/ 292968 | consumed samples: 3067904 | consumed tokens: 311050240 | elapsed time per iteration (ms): 110209.2 | learning rate: 8.181E-05 | global batch size: 2048 | lm loss: 4.146855E+00 | loss scale: 16384.0 | grad norm: 8151.655 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1499/ 292968 | consumed samples: 3069952 | consumed tokens: 311345152 | elapsed time per iteration (ms): 115973.5 | learning rate: 8.187E-05 | global batch size: 2048 | lm loss: 4.215507E+00 | loss scale: 16384.0 | grad norm: 11096.944 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1500/ 292968 | consumed samples: 3072000 | consumed tokens: 311640064 | elapsed time per iteration (ms): 104877.4 | learning rate: 8.192E-05 | global batch size: 2048 | lm loss: 4.150307E+00 | loss scale: 32768.0 | grad norm: 14299.236 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
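The validation record below reports both "lm loss value" and "lm loss PPL"; the perplexity column is simply exp(loss). A one-line check in Python, assuming only the two values printed in that record (math.exp is the only dependency):

    # Illustrative only: perplexity is the exponential of the mean lm loss.
    import math

    lm_loss = 4.159744        # lm loss value at iteration 1500
    ppl = math.exp(lm_loss)   # perplexity
    print(f"{ppl:.4E}")       # -> 6.4055E+01, matching the logged 6.405511E+01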
-------------------------------------------------------------------------------------------------
- validation loss at iteration 1500 | lm loss value: 4.159744E+00 | lm loss PPL: 6.405511E+01 |
-------------------------------------------------------------------------------------------------
-saving checkpoint at iteration 1500 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-26 17:07:58,201] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/mp_rank_01_model_states.pt
-[2021-10-26 17:07:58,526] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/mp_rank_00_model_states.pt
-[2021-10-26 17:08:11,262] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_76_optim_states.pt
-[2021-10-26 17:08:11,268] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_84_optim_states.pt
-[2021-10-26 17:08:11,272] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_77_optim_states.pt
-[2021-10-26 17:08:11,312] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_104_optim_states.pt
-[2021-10-26 17:08:11,329] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_09_optim_states.pt
-[2021-10-26 17:08:11,333] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_05_optim_states.pt
-[2021-10-26 17:08:11,341] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_123_optim_states.pt
-[2021-10-26 17:08:11,380] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_113_optim_states.pt
-[2021-10-26 17:08:11,386] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_122_optim_states.pt
-[2021-10-26 17:08:11,440] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_27_optim_states.pt
-[2021-10-26 17:08:11,516] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_116_optim_states.pt
-[2021-10-26 17:08:11,523] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_07_optim_states.pt
-[2021-10-26 17:08:11,534] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_82_optim_states.pt
-[2021-10-26 17:08:11,538] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_72_optim_states.pt
-[2021-10-26 17:08:11,570] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_73_optim_states.pt
-[2021-10-26 17:08:11,573] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_119_optim_states.pt
-[2021-10-26 17:08:11,592] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_23_optim_states.pt
-[2021-10-26 17:08:11,602] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_87_optim_states.pt
-[2021-10-26 17:08:11,611] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_10_optim_states.pt
-[2021-10-26 17:08:11,639] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_115_optim_states.pt
-[2021-10-26 17:08:11,640] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_107_optim_states.pt
-[2021-10-26 17:08:11,661] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_14_optim_states.pt
-[2021-10-26 17:08:11,712] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_90_optim_states.pt
-[2021-10-26 17:08:11,715] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_81_optim_states.pt
-[2021-10-26 17:08:11,762] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_88_optim_states.pt
-[2021-10-26 17:08:11,820] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-26 17:08:11,824] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_24_optim_states.pt
-[2021-10-26 17:08:11,859] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_111_optim_states.pt
-[2021-10-26 17:08:12,006] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_13_optim_states.pt
-[2021-10-26 17:08:12,046] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_95_optim_states.pt
-[2021-10-26 17:08:12,134] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_96_optim_states.pt
-[2021-10-26 17:08:12,194] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_97_optim_states.pt
-[2021-10-26 17:08:12,315] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_57_optim_states.pt
-[2021-10-26 17:08:12,431] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_25_optim_states.pt
-[2021-10-26 17:08:12,458] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_114_optim_states.pt
-[2021-10-26 17:08:12,471] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_78_optim_states.pt
-[2021-10-26 17:08:12,479] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_80_optim_states.pt
-[2021-10-26 17:08:12,480] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_37_optim_states.pt
-[2021-10-26 17:08:12,485] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_75_optim_states.pt
-[2021-10-26 17:08:12,486] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_120_optim_states.pt
-[2021-10-26 17:08:12,497] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_121_optim_states.pt
-[2021-10-26 17:08:12,527] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_15_optim_states.pt
-[2021-10-26 17:08:12,536] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_79_optim_states.pt
-[2021-10-26 17:08:12,537] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_12_optim_states.pt
-[2021-10-26 17:08:12,547] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_70_optim_states.pt
-[2021-10-26 17:08:12,554] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_11_optim_states.pt
-[2021-10-26 17:08:12,583] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_118_optim_states.pt
-[2021-10-26 17:08:12,604] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_106_optim_states.pt
-[2021-10-26 17:08:12,605] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_110_optim_states.pt
-[2021-10-26 17:08:12,613] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_91_optim_states.pt
-[2021-10-26 17:08:12,634] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_06_optim_states.pt
-[2021-10-26 17:08:12,640] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_62_optim_states.pt
-[2021-10-26 17:08:12,643] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_86_optim_states.pt
-[2021-10-26 17:08:12,660] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_93_optim_states.pt
-[2021-10-26 17:08:12,668] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_33_optim_states.pt
-[2021-10-26 17:08:12,698] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_71_optim_states.pt
-[2021-10-26 17:08:12,707] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_101_optim_states.pt
-[2021-10-26 17:08:12,714] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_26_optim_states.pt
-[2021-10-26 17:08:12,726] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_83_optim_states.pt
-[2021-10-26 17:08:12,729] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_04_optim_states.pt
-[2021-10-26 17:08:12,732] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_74_optim_states.pt
-[2021-10-26 17:08:12,755] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_108_optim_states.pt
-[2021-10-26 17:08:12,829] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_85_optim_states.pt
-[2021-10-26 17:08:12,868] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_98_optim_states.pt
-[2021-10-26 17:08:12,872] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_117_optim_states.pt
-[2021-10-26 17:08:12,873] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_61_optim_states.pt
-[2021-10-26 17:08:12,873] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_105_optim_states.pt
-[2021-10-26 17:08:12,875] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_08_optim_states.pt
-[2021-10-26 17:08:12,887] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_94_optim_states.pt
-[2021-10-26 17:08:12,887] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_109_optim_states.pt
-[2021-10-26 17:08:12,905] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_102_optim_states.pt
-[2021-10-26 17:08:12,920] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_89_optim_states.pt
-[2021-10-26 17:08:12,928] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_43_optim_states.pt
-[2021-10-26 17:08:12,936] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_103_optim_states.pt
-[2021-10-26 17:08:12,958] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_99_optim_states.pt
-[2021-10-26 17:08:12,971] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_44_optim_states.pt
-[2021-10-26 17:08:12,991] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-26 17:08:13,034] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-26 17:08:13,036] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-26 17:08:13,041] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-26 17:08:13,046] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-26 17:08:13,049] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-26 17:08:13,064] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-26 17:08:13,066] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-26 17:08:13,099] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-26 17:08:13,101] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-26 17:08:13,156] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-26 17:08:13,167] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-26 17:08:13,190] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-26 17:08:13,193] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-26 17:08:13,210] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-26 17:08:13,252] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-26 17:08:13,291] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-26 17:08:13,309] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-26 17:08:13,353] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-26 17:08:13,384] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-26 17:08:13,440] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-26 17:08:13,465] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-26 17:08:13,493] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-26 17:08:13,497] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-26 17:08:13,502] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-26 17:08:13,530] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-26 17:08:13,610] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-26 17:08:13,659] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-26 17:08:13,700] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-26 17:08:13,731] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-26 17:08:13,782] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-26 17:08:13,808] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-26 17:08:13,815] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-26 17:08:13,855] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-26 17:08:13,907] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-26 17:08:13,961] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-26 17:08:14,190] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-26 17:08:14,195] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-26 17:08:14,405] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-26 17:08:14,533] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-26 17:08:15,073] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-26 17:08:15,527] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-26 17:08:15,587] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-26 17:08:15,706] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt -[2021-10-26 17:08:15,793] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_126_optim_states.pt -[2021-10-26 17:08:17,391] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_20_optim_states.pt -[2021-10-26 17:08:17,493] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_21_optim_states.pt -[2021-10-26 17:08:18,443] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-26 17:08:18,621] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
- successfully saved checkpoint at iteration 1500 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 25583.50
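The save above writes one ZeRO optimizer-state shard per model-parallel rank (mp_rank_00 through mp_rank_127 appear in this excerpt, all for pp rank 0) into the global_step1500 directory, and the whole save cost 25583.50 ms. A minimal sketch, in Python, for checking that such a step directory holds a shard for every rank; the 128-rank count and the missing_shards helper are assumptions for illustration, not values taken from the training config:

import re
from pathlib import Path

# Shard names follow the pattern visible in the log records above.
SHARD_RE = re.compile(r"zero_pp_rank_(\d+)_mp_rank_(\d+)_optim_states\.pt")
EXPECTED_MP_RANKS = 128  # assumed: mp_rank_00 .. mp_rank_127 appear in this excerpt

def missing_shards(step_dir, expected=EXPECTED_MP_RANKS):
    """Return the mp ranks (for pp rank 0) with no optimizer-state shard in step_dir."""
    seen = set()
    for path in Path(step_dir).iterdir():
        m = SHARD_RE.fullmatch(path.name)
        if m and int(m.group(1)) == 0:
            seen.add(int(m.group(2)))
    return sorted(set(range(expected)) - seen)

# An empty list would mean the global_step1500 save is complete.
print(missing_shards(
    "/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1500"))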
consumed tokens: 313999360 | elapsed time per iteration (ms): 103983.3 | learning rate: 8.236E-05 | global batch size: 2048 | lm loss: 4.183975E+00 | loss scale: 32768.0 | grad norm: 22227.229 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1509/ 292968 | consumed samples: 3090432 | consumed tokens: 314294272 | elapsed time per iteration (ms): 103025.3 | learning rate: 8.241E-05 | global batch size: 2048 | lm loss: 4.186974E+00 | loss scale: 32768.0 | grad norm: 17171.986 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1510/ 292968 | consumed samples: 3092480 | consumed tokens: 314589184 | elapsed time per iteration (ms): 108791.9 | learning rate: 8.247E-05 | global batch size: 2048 | lm loss: 4.195477E+00 | loss scale: 32768.0 | grad norm: 17392.659 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1511/ 292968 | consumed samples: 3094528 | consumed tokens: 314884096 | elapsed time per iteration (ms): 110895.9 | learning rate: 8.252E-05 | global batch size: 2048 | lm loss: 4.162581E+00 | loss scale: 32768.0 | grad norm: 18393.810 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1512/ 292968 | consumed samples: 3096576 | consumed tokens: 315179008 | elapsed time per iteration (ms): 104511.7 | learning rate: 8.258E-05 | global batch size: 2048 | lm loss: 4.168368E+00 | loss scale: 32768.0 | grad norm: 18365.563 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1513/ 292968 | consumed samples: 3098624 | consumed tokens: 315473920 | elapsed time per iteration (ms): 109312.5 | learning rate: 8.263E-05 | global batch size: 2048 | lm loss: 4.161445E+00 | loss scale: 32768.0 | grad norm: 18148.313 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1514/ 292968 | consumed samples: 3100672 | consumed tokens: 315768832 | elapsed time per iteration (ms): 110136.6 | learning rate: 8.268E-05 | global batch size: 2048 | lm loss: 4.158703E+00 | loss scale: 32768.0 | grad norm: 15693.248 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1515/ 292968 | consumed samples: 3102720 | consumed tokens: 316063744 | elapsed time per iteration (ms): 104991.9 | learning rate: 8.274E-05 | global batch size: 2048 | lm loss: 4.136301E+00 | loss scale: 32768.0 | grad norm: 21618.989 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1516/ 292968 | consumed samples: 3104768 | consumed tokens: 316358656 | elapsed time per iteration (ms): 105781.7 | learning rate: 8.279E-05 | global batch size: 2048 | lm loss: 4.168713E+00 | loss scale: 32768.0 | grad norm: 17414.041 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1517/ 292968 | consumed samples: 3106816 | consumed tokens: 316653568 | elapsed time per iteration (ms): 106179.6 | learning rate: 8.285E-05 | global batch size: 2048 | lm loss: 4.181880E+00 | loss scale: 32768.0 | grad norm: 14356.046 | num zeros: 0.0 | curriculum seqlen: 144 | number of 
skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1518/ 292968 | consumed samples: 3108864 | consumed tokens: 316948480 | elapsed time per iteration (ms): 105619.1 | learning rate: 8.290E-05 | global batch size: 2048 | lm loss: 4.188911E+00 | loss scale: 32768.0 | grad norm: 16226.121 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1519/ 292968 | consumed samples: 3110912 | consumed tokens: 317243392 | elapsed time per iteration (ms): 105487.3 | learning rate: 8.296E-05 | global batch size: 2048 | lm loss: 4.119454E+00 | loss scale: 32768.0 | grad norm: 20715.539 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1520/ 292968 | consumed samples: 3112960 | consumed tokens: 317538304 | elapsed time per iteration (ms): 107006.6 | learning rate: 8.301E-05 | global batch size: 2048 | lm loss: 4.193812E+00 | loss scale: 32768.0 | grad norm: 23717.421 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1521/ 292968 | consumed samples: 3115008 | consumed tokens: 317833216 | elapsed time per iteration (ms): 104883.4 | learning rate: 8.307E-05 | global batch size: 2048 | lm loss: 4.175305E+00 | loss scale: 32768.0 | grad norm: 22627.236 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1522/ 292968 | consumed samples: 3117056 | consumed tokens: 318128128 | elapsed time per iteration (ms): 108010.3 | learning rate: 8.312E-05 | global batch size: 2048 | lm loss: 4.146116E+00 | loss scale: 32768.0 | grad norm: 21298.049 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1523/ 292968 | consumed samples: 3119104 | consumed tokens: 318423040 | elapsed time per iteration (ms): 106400.2 | learning rate: 8.318E-05 | global batch size: 2048 | lm loss: 4.167277E+00 | loss scale: 32768.0 | grad norm: 14984.326 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1524/ 292968 | consumed samples: 3121152 | consumed tokens: 318717952 | elapsed time per iteration (ms): 105985.0 | learning rate: 8.323E-05 | global batch size: 2048 | lm loss: 4.166503E+00 | loss scale: 32768.0 | grad norm: 15653.955 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1525/ 292968 | consumed samples: 3123200 | consumed tokens: 319012864 | elapsed time per iteration (ms): 107617.3 | learning rate: 8.329E-05 | global batch size: 2048 | lm loss: 4.165236E+00 | loss scale: 32768.0 | grad norm: 15462.584 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1526/ 292968 | consumed samples: 3125248 | consumed tokens: 319307776 | elapsed time per iteration (ms): 104251.4 | learning rate: 8.334E-05 | global batch size: 2048 | lm loss: 4.140454E+00 | loss scale: 32768.0 | grad norm: 20513.774 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1527/ 292968 | consumed samples: 3127296 | consumed tokens: 319602688 | elapsed time per iteration (ms): 105436.2 | learning rate: 8.339E-05 | global batch size: 2048 | lm 
loss: 4.146454E+00 | loss scale: 32768.0 | grad norm: 23593.061 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1528/ 292968 | consumed samples: 3129344 | consumed tokens: 319897600 | elapsed time per iteration (ms): 103251.1 | learning rate: 8.345E-05 | global batch size: 2048 | lm loss: 4.158430E+00 | loss scale: 32768.0 | grad norm: 23424.606 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1529/ 292968 | consumed samples: 3131392 | consumed tokens: 320192512 | elapsed time per iteration (ms): 106216.5 | learning rate: 8.350E-05 | global batch size: 2048 | lm loss: 4.165556E+00 | loss scale: 32768.0 | grad norm: 19956.278 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1530/ 292968 | consumed samples: 3133440 | consumed tokens: 320487424 | elapsed time per iteration (ms): 105417.7 | learning rate: 8.356E-05 | global batch size: 2048 | lm loss: 4.153375E+00 | loss scale: 32768.0 | grad norm: 25273.694 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1531/ 292968 | consumed samples: 3135488 | consumed tokens: 320782336 | elapsed time per iteration (ms): 104399.3 | learning rate: 8.361E-05 | global batch size: 2048 | lm loss: 4.170780E+00 | loss scale: 32768.0 | grad norm: 24832.897 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1532/ 292968 | consumed samples: 3137536 | consumed tokens: 321077248 | elapsed time per iteration (ms): 103835.0 | learning rate: 8.367E-05 | global batch size: 2048 | lm loss: 4.154833E+00 | loss scale: 32768.0 | grad norm: 19935.943 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1533/ 292968 | consumed samples: 3139584 | consumed tokens: 321372160 | elapsed time per iteration (ms): 105482.5 | learning rate: 8.372E-05 | global batch size: 2048 | lm loss: 4.143008E+00 | loss scale: 32768.0 | grad norm: 20317.584 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1534/ 292968 | consumed samples: 3141632 | consumed tokens: 321667072 | elapsed time per iteration (ms): 104597.6 | learning rate: 8.378E-05 | global batch size: 2048 | lm loss: 4.167706E+00 | loss scale: 32768.0 | grad norm: 19625.439 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1535/ 292968 | consumed samples: 3143680 | consumed tokens: 321961984 | elapsed time per iteration (ms): 104810.1 | learning rate: 8.383E-05 | global batch size: 2048 | lm loss: 4.140921E+00 | loss scale: 32768.0 | grad norm: 16922.271 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1536/ 292968 | consumed samples: 3145728 | consumed tokens: 322256896 | elapsed time per iteration (ms): 106183.5 | learning rate: 8.389E-05 | global batch size: 2048 | lm loss: 4.160961E+00 | loss scale: 32768.0 | grad norm: 18999.065 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1537/ 292968 | consumed samples: 3147776 | consumed 
tokens: 322551808 | elapsed time per iteration (ms): 104071.9 | learning rate: 8.394E-05 | global batch size: 2048 | lm loss: 4.165040E+00 | loss scale: 32768.0 | grad norm: 21212.839 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1538/ 292968 | consumed samples: 3149824 | consumed tokens: 322846720 | elapsed time per iteration (ms): 105801.5 | learning rate: 8.400E-05 | global batch size: 2048 | lm loss: 4.143928E+00 | loss scale: 32768.0 | grad norm: 19399.994 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1539/ 292968 | consumed samples: 3151872 | consumed tokens: 323141632 | elapsed time per iteration (ms): 105762.2 | learning rate: 8.405E-05 | global batch size: 2048 | lm loss: 4.145596E+00 | loss scale: 32768.0 | grad norm: 16444.079 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1540/ 292968 | consumed samples: 3153920 | consumed tokens: 323436544 | elapsed time per iteration (ms): 105230.0 | learning rate: 8.410E-05 | global batch size: 2048 | lm loss: 4.182285E+00 | loss scale: 32768.0 | grad norm: 18645.171 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1541/ 292968 | consumed samples: 3155968 | consumed tokens: 323731456 | elapsed time per iteration (ms): 104359.5 | learning rate: 8.416E-05 | global batch size: 2048 | lm loss: 4.153680E+00 | loss scale: 32768.0 | grad norm: 19144.070 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1542/ 292968 | consumed samples: 3158016 | consumed tokens: 324026368 | elapsed time per iteration (ms): 103247.0 | learning rate: 8.421E-05 | global batch size: 2048 | lm loss: 4.141051E+00 | loss scale: 32768.0 | grad norm: 22728.673 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1543/ 292968 | consumed samples: 3160064 | consumed tokens: 324321280 | elapsed time per iteration (ms): 105587.2 | learning rate: 8.427E-05 | global batch size: 2048 | lm loss: 4.162117E+00 | loss scale: 32768.0 | grad norm: 22320.995 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1544/ 292968 | consumed samples: 3162112 | consumed tokens: 324616192 | elapsed time per iteration (ms): 107583.5 | learning rate: 8.432E-05 | global batch size: 2048 | lm loss: 4.118957E+00 | loss scale: 32768.0 | grad norm: 18585.285 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1545/ 292968 | consumed samples: 3164160 | consumed tokens: 324911104 | elapsed time per iteration (ms): 103533.0 | learning rate: 8.438E-05 | global batch size: 2048 | lm loss: 4.194981E+00 | loss scale: 32768.0 | grad norm: 17424.365 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1546/ 292968 | consumed samples: 3166208 | consumed tokens: 325206016 | elapsed time per iteration (ms): 105785.8 | learning rate: 8.443E-05 | global batch size: 2048 | lm loss: 4.172066E+00 | loss scale: 32768.0 | grad norm: 14657.355 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1547/ 292968 | consumed samples: 3168256 | consumed tokens: 325500928 | elapsed time per iteration (ms): 103440.1 | learning rate: 8.449E-05 | global batch size: 2048 | lm loss: 4.149372E+00 | loss scale: 32768.0 | grad norm: 20054.615 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1548/ 292968 | consumed samples: 3170304 | consumed tokens: 325795840 | elapsed time per iteration (ms): 103262.2 | learning rate: 8.454E-05 | global batch size: 2048 | lm loss: 4.142512E+00 | loss scale: 32768.0 | grad norm: 26019.156 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1549/ 292968 | consumed samples: 3172352 | consumed tokens: 326090752 | elapsed time per iteration (ms): 104600.2 | learning rate: 8.460E-05 | global batch size: 2048 | lm loss: 4.132460E+00 | loss scale: 32768.0 | grad norm: 26529.073 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1550/ 292968 | consumed samples: 3174400 | consumed tokens: 326385664 | elapsed time per iteration (ms): 105072.8 | learning rate: 8.465E-05 | global batch size: 2048 | lm loss: 4.136762E+00 | loss scale: 32768.0 | grad norm: 21722.158 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1551/ 292968 | consumed samples: 3176448 | consumed tokens: 326680576 | elapsed time per iteration (ms): 104433.0 | learning rate: 8.471E-05 | global batch size: 2048 | lm loss: 4.147036E+00 | loss scale: 32768.0 | grad norm: 18804.830 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1552/ 292968 | consumed samples: 3178496 | consumed tokens: 326975488 | elapsed time per iteration (ms): 104907.4 | learning rate: 8.476E-05 | global batch size: 2048 | lm loss: 4.139750E+00 | loss scale: 32768.0 | grad norm: 17089.094 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1553/ 292968 | consumed samples: 3180544 | consumed tokens: 327270400 | elapsed time per iteration (ms): 104628.3 | learning rate: 8.481E-05 | global batch size: 2048 | lm loss: 4.148928E+00 | loss scale: 32768.0 | grad norm: 21712.401 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1554/ 292968 | consumed samples: 3182592 | consumed tokens: 327565312 | elapsed time per iteration (ms): 104439.0 | learning rate: 8.487E-05 | global batch size: 2048 | lm loss: 4.136716E+00 | loss scale: 32768.0 | grad norm: 23112.337 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1555/ 292968 | consumed samples: 3184640 | consumed tokens: 327860224 | elapsed time per iteration (ms): 104000.7 | learning rate: 8.492E-05 | global batch size: 2048 | lm loss: 4.155643E+00 | loss scale: 32768.0 | grad norm: 19676.444 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1556/ 292968 | consumed samples: 3186688 | consumed tokens: 328155136 | elapsed time per iteration (ms): 108353.6 | learning rate: 8.498E-05 | global batch size: 2048 | lm loss: 
4.117136E+00 | loss scale: 32768.0 | grad norm: 15672.471 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1557/ 292968 | consumed samples: 3188736 | consumed tokens: 328450048 | elapsed time per iteration (ms): 104098.3 | learning rate: 8.503E-05 | global batch size: 2048 | lm loss: 4.134876E+00 | loss scale: 32768.0 | grad norm: 17258.723 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1558/ 292968 | consumed samples: 3190784 | consumed tokens: 328744960 | elapsed time per iteration (ms): 104701.7 | learning rate: 8.509E-05 | global batch size: 2048 | lm loss: 4.137351E+00 | loss scale: 32768.0 | grad norm: 18650.497 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1559/ 292968 | consumed samples: 3192832 | consumed tokens: 329039872 | elapsed time per iteration (ms): 103726.2 | learning rate: 8.514E-05 | global batch size: 2048 | lm loss: 4.152483E+00 | loss scale: 32768.0 | grad norm: 24707.004 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1560/ 292968 | consumed samples: 3194880 | consumed tokens: 329334784 | elapsed time per iteration (ms): 104093.1 | learning rate: 8.520E-05 | global batch size: 2048 | lm loss: 4.140297E+00 | loss scale: 32768.0 | grad norm: 30527.425 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1561/ 292968 | consumed samples: 3196928 | consumed tokens: 329629696 | elapsed time per iteration (ms): 103290.0 | learning rate: 8.525E-05 | global batch size: 2048 | lm loss: 4.128441E+00 | loss scale: 32768.0 | grad norm: 22949.441 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1562/ 292968 | consumed samples: 3198976 | consumed tokens: 329924608 | elapsed time per iteration (ms): 104734.1 | learning rate: 8.531E-05 | global batch size: 2048 | lm loss: 4.142885E+00 | loss scale: 32768.0 | grad norm: 15850.599 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1563/ 292968 | consumed samples: 3201024 | consumed tokens: 330219520 | elapsed time per iteration (ms): 103514.5 | learning rate: 8.536E-05 | global batch size: 2048 | lm loss: 4.130913E+00 | loss scale: 32768.0 | grad norm: 14941.324 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1564/ 292968 | consumed samples: 3203072 | consumed tokens: 330514432 | elapsed time per iteration (ms): 105767.6 | learning rate: 8.542E-05 | global batch size: 2048 | lm loss: 4.127303E+00 | loss scale: 32768.0 | grad norm: 17454.689 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1565/ 292968 | consumed samples: 3205120 | consumed tokens: 330809344 | elapsed time per iteration (ms): 104173.6 | learning rate: 8.547E-05 | global batch size: 2048 | lm loss: 4.135751E+00 | loss scale: 32768.0 | grad norm: 15428.579 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1566/ 292968 | consumed samples: 3207168 | consumed tokens: 
331104256 | elapsed time per iteration (ms): 104972.2 | learning rate: 8.552E-05 | global batch size: 2048 | lm loss: 4.115630E+00 | loss scale: 32768.0 | grad norm: 13137.476 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1567/ 292968 | consumed samples: 3209216 | consumed tokens: 331399168 | elapsed time per iteration (ms): 104818.2 | learning rate: 8.558E-05 | global batch size: 2048 | lm loss: 4.168973E+00 | loss scale: 32768.0 | grad norm: 13335.591 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1568/ 292968 | consumed samples: 3211264 | consumed tokens: 331694080 | elapsed time per iteration (ms): 103308.2 | learning rate: 8.563E-05 | global batch size: 2048 | lm loss: 4.127815E+00 | loss scale: 32768.0 | grad norm: 14958.767 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1569/ 292968 | consumed samples: 3213312 | consumed tokens: 331988992 | elapsed time per iteration (ms): 105630.6 | learning rate: 8.569E-05 | global batch size: 2048 | lm loss: 4.148279E+00 | loss scale: 32768.0 | grad norm: 16201.550 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1570/ 292968 | consumed samples: 3215360 | consumed tokens: 332283904 | elapsed time per iteration (ms): 105291.3 | learning rate: 8.574E-05 | global batch size: 2048 | lm loss: 4.138139E+00 | loss scale: 32768.0 | grad norm: 20636.446 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1571/ 292968 | consumed samples: 3217408 | consumed tokens: 332578816 | elapsed time per iteration (ms): 104661.8 | learning rate: 8.580E-05 | global batch size: 2048 | lm loss: 4.120522E+00 | loss scale: 32768.0 | grad norm: 23572.463 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1572/ 292968 | consumed samples: 3219456 | consumed tokens: 332873728 | elapsed time per iteration (ms): 105890.1 | learning rate: 8.585E-05 | global batch size: 2048 | lm loss: 4.116461E+00 | loss scale: 32768.0 | grad norm: 20069.106 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1573/ 292968 | consumed samples: 3221504 | consumed tokens: 333168640 | elapsed time per iteration (ms): 104943.4 | learning rate: 8.591E-05 | global batch size: 2048 | lm loss: 4.143171E+00 | loss scale: 32768.0 | grad norm: 18737.961 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1574/ 292968 | consumed samples: 3223552 | consumed tokens: 333463552 | elapsed time per iteration (ms): 104640.4 | learning rate: 8.596E-05 | global batch size: 2048 | lm loss: 4.139335E+00 | loss scale: 32768.0 | grad norm: 20283.590 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1575/ 292968 | consumed samples: 3225600 | consumed tokens: 333758464 | elapsed time per iteration (ms): 105883.6 | learning rate: 8.602E-05 | global batch size: 2048 | lm loss: 4.154176E+00 | loss scale: 32768.0 | grad norm: 18810.405 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 
| number of nan iterations: 0 | -time (ms) - iteration 1576/ 292968 | consumed samples: 3227648 | consumed tokens: 334053376 | elapsed time per iteration (ms): 102945.3 | learning rate: 8.607E-05 | global batch size: 2048 | lm loss: 4.128248E+00 | loss scale: 32768.0 | grad norm: 23969.397 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1577/ 292968 | consumed samples: 3229696 | consumed tokens: 334348288 | elapsed time per iteration (ms): 104000.9 | learning rate: 8.613E-05 | global batch size: 2048 | lm loss: 4.155667E+00 | loss scale: 32768.0 | grad norm: 27843.447 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1578/ 292968 | consumed samples: 3231744 | consumed tokens: 334643200 | elapsed time per iteration (ms): 103946.3 | learning rate: 8.618E-05 | global batch size: 2048 | lm loss: 4.132092E+00 | loss scale: 32768.0 | grad norm: 18685.435 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1579/ 292968 | consumed samples: 3233792 | consumed tokens: 334938112 | elapsed time per iteration (ms): 105038.1 | learning rate: 8.623E-05 | global batch size: 2048 | lm loss: 4.124686E+00 | loss scale: 32768.0 | grad norm: 19963.193 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1580/ 292968 | consumed samples: 3235840 | consumed tokens: 335233024 | elapsed time per iteration (ms): 103374.8 | learning rate: 8.629E-05 | global batch size: 2048 | lm loss: 4.146832E+00 | loss scale: 32768.0 | grad norm: 23238.226 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1581/ 292968 | consumed samples: 3237888 | consumed tokens: 335527936 | elapsed time per iteration (ms): 104338.4 | learning rate: 8.634E-05 | global batch size: 2048 | lm loss: 4.144770E+00 | loss scale: 32768.0 | grad norm: 21792.914 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1582/ 292968 | consumed samples: 3239936 | consumed tokens: 335822848 | elapsed time per iteration (ms): 104885.0 | learning rate: 8.640E-05 | global batch size: 2048 | lm loss: 4.127496E+00 | loss scale: 32768.0 | grad norm: 25836.004 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1583/ 292968 | consumed samples: 3241984 | consumed tokens: 336117760 | elapsed time per iteration (ms): 103893.8 | learning rate: 8.645E-05 | global batch size: 2048 | lm loss: 4.166101E+00 | loss scale: 32768.0 | grad norm: 21336.296 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1584/ 292968 | consumed samples: 3244032 | consumed tokens: 336412672 | elapsed time per iteration (ms): 104078.1 | learning rate: 8.651E-05 | global batch size: 2048 | lm loss: 4.117161E+00 | loss scale: 32768.0 | grad norm: 14350.832 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1585/ 292968 | consumed samples: 3246080 | consumed tokens: 336707584 | elapsed time per iteration (ms): 105402.9 | learning rate: 8.656E-05 | global batch size: 2048 | lm loss: 4.146427E+00 | 
loss scale: 32768.0 | grad norm: 12478.064 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1586/ 292968 | consumed samples: 3248128 | consumed tokens: 337002496 | elapsed time per iteration (ms): 104507.9 | learning rate: 8.662E-05 | global batch size: 2048 | lm loss: 4.126790E+00 | loss scale: 32768.0 | grad norm: 12207.322 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1587/ 292968 | consumed samples: 3250176 | consumed tokens: 337297408 | elapsed time per iteration (ms): 101633.3 | learning rate: 8.667E-05 | global batch size: 2048 | lm loss: 4.105484E+00 | loss scale: 32768.0 | grad norm: 14376.602 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1588/ 292968 | consumed samples: 3252224 | consumed tokens: 337592320 | elapsed time per iteration (ms): 104404.9 | learning rate: 8.673E-05 | global batch size: 2048 | lm loss: 4.124932E+00 | loss scale: 32768.0 | grad norm: 16281.445 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1589/ 292968 | consumed samples: 3254272 | consumed tokens: 337887232 | elapsed time per iteration (ms): 107568.4 | learning rate: 8.678E-05 | global batch size: 2048 | lm loss: 4.118083E+00 | loss scale: 32768.0 | grad norm: 19120.127 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1590/ 292968 | consumed samples: 3256320 | consumed tokens: 338182144 | elapsed time per iteration (ms): 104366.4 | learning rate: 8.684E-05 | global batch size: 2048 | lm loss: 4.129394E+00 | loss scale: 32768.0 | grad norm: 20415.166 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1591/ 292968 | consumed samples: 3258368 | consumed tokens: 338477056 | elapsed time per iteration (ms): 103644.8 | learning rate: 8.689E-05 | global batch size: 2048 | lm loss: 4.127513E+00 | loss scale: 32768.0 | grad norm: 19338.000 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1592/ 292968 | consumed samples: 3260416 | consumed tokens: 338771968 | elapsed time per iteration (ms): 103421.5 | learning rate: 8.694E-05 | global batch size: 2048 | lm loss: 4.130140E+00 | loss scale: 32768.0 | grad norm: 19741.003 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1593/ 292968 | consumed samples: 3262464 | consumed tokens: 339066880 | elapsed time per iteration (ms): 106619.1 | learning rate: 8.700E-05 | global batch size: 2048 | lm loss: 4.143212E+00 | loss scale: 32768.0 | grad norm: 24142.122 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1594/ 292968 | consumed samples: 3264512 | consumed tokens: 339361792 | elapsed time per iteration (ms): 99783.4 | learning rate: 8.705E-05 | global batch size: 2048 | lm loss: 4.132574E+00 | loss scale: 32768.0 | grad norm: 25321.581 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1595/ 292968 | consumed samples: 3266560 | consumed tokens: 339656704 | 
elapsed time per iteration (ms): 104645.5 | learning rate: 8.711E-05 | global batch size: 2048 | lm loss: 4.115793E+00 | loss scale: 32768.0 | grad norm: 25213.682 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1596/ 292968 | consumed samples: 3268608 | consumed tokens: 339951616 | elapsed time per iteration (ms): 104135.1 | learning rate: 8.716E-05 | global batch size: 2048 | lm loss: 4.125645E+00 | loss scale: 32768.0 | grad norm: 24668.893 | num zeros: 0.0 | curriculum seqlen: 144 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1597/ 292968 | consumed samples: 3270656 | consumed tokens: 340262912 | elapsed time per iteration (ms): 103628.4 | learning rate: 8.722E-05 | global batch size: 2048 | lm loss: 4.166674E+00 | loss scale: 32768.0 | grad norm: 23702.821 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1598/ 292968 | consumed samples: 3272704 | consumed tokens: 340574208 | elapsed time per iteration (ms): 105784.9 | learning rate: 8.727E-05 | global batch size: 2048 | lm loss: 4.179239E+00 | loss scale: 32768.0 | grad norm: 23353.468 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1599/ 292968 | consumed samples: 3274752 | consumed tokens: 340885504 | elapsed time per iteration (ms): 104574.9 | learning rate: 8.733E-05 | global batch size: 2048 | lm loss: 4.143254E+00 | loss scale: 32768.0 | grad norm: 22067.607 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1600/ 292968 | consumed samples: 3276800 | consumed tokens: 341196800 | elapsed time per iteration (ms): 103152.9 | learning rate: 8.738E-05 | global batch size: 2048 | lm loss: 4.112158E+00 | loss scale: 32768.0 | grad norm: 23742.094 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1601/ 292968 | consumed samples: 3278848 | consumed tokens: 341508096 | elapsed time per iteration (ms): 105110.2 | learning rate: 8.744E-05 | global batch size: 2048 | lm loss: 4.167132E+00 | loss scale: 32768.0 | grad norm: 32077.868 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1602/ 292968 | consumed samples: 3280896 | consumed tokens: 341819392 | elapsed time per iteration (ms): 103163.5 | learning rate: 8.749E-05 | global batch size: 2048 | lm loss: 4.151443E+00 | loss scale: 32768.0 | grad norm: 21285.195 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1603/ 292968 | consumed samples: 3282944 | consumed tokens: 342130688 | elapsed time per iteration (ms): 105828.6 | learning rate: 8.755E-05 | global batch size: 2048 | lm loss: 4.163060E+00 | loss scale: 32768.0 | grad norm: 23736.558 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1604/ 292968 | consumed samples: 3284992 | consumed tokens: 342441984 | elapsed time per iteration (ms): 104601.4 | learning rate: 8.760E-05 | global batch size: 2048 | lm loss: 4.146809E+00 | loss scale: 32768.0 | grad norm: 26923.892 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1605/ 292968 | consumed samples: 3287040 | consumed tokens: 342753280 | elapsed time per iteration (ms): 104140.2 | learning rate: 8.765E-05 | global batch size: 2048 | lm loss: 4.148554E+00 | loss scale: 32768.0 | grad norm: 22516.344 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1606/ 292968 | consumed samples: 3289088 | consumed tokens: 343064576 | elapsed time per iteration (ms): 102793.0 | learning rate: 8.771E-05 | global batch size: 2048 | lm loss: 4.137195E+00 | loss scale: 32768.0 | grad norm: 23462.303 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1607/ 292968 | consumed samples: 3291136 | consumed tokens: 343375872 | elapsed time per iteration (ms): 105843.5 | learning rate: 8.776E-05 | global batch size: 2048 | lm loss: 4.115441E+00 | loss scale: 32768.0 | grad norm: 20312.683 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1608/ 292968 | consumed samples: 3293184 | consumed tokens: 343687168 | elapsed time per iteration (ms): 105027.3 | learning rate: 8.782E-05 | global batch size: 2048 | lm loss: 4.131564E+00 | loss scale: 32768.0 | grad norm: 19407.537 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1609/ 292968 | consumed samples: 3295232 | consumed tokens: 343998464 | elapsed time per iteration (ms): 104339.0 | learning rate: 8.787E-05 | global batch size: 2048 | lm loss: 4.128519E+00 | loss scale: 32768.0 | grad norm: 21459.607 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1610/ 292968 | consumed samples: 3297280 | consumed tokens: 344309760 | elapsed time per iteration (ms): 105666.7 | learning rate: 8.793E-05 | global batch size: 2048 | lm loss: 4.106834E+00 | loss scale: 32768.0 | grad norm: 19434.461 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1611/ 292968 | consumed samples: 3299328 | consumed tokens: 344621056 | elapsed time per iteration (ms): 103938.2 | learning rate: 8.798E-05 | global batch size: 2048 | lm loss: 4.097841E+00 | loss scale: 32768.0 | grad norm: 17632.017 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1612/ 292968 | consumed samples: 3301376 | consumed tokens: 344932352 | elapsed time per iteration (ms): 107290.6 | learning rate: 8.804E-05 | global batch size: 2048 | lm loss: 4.120338E+00 | loss scale: 32768.0 | grad norm: 21648.945 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1613/ 292968 | consumed samples: 3303424 | consumed tokens: 345243648 | elapsed time per iteration (ms): 103846.2 | learning rate: 8.809E-05 | global batch size: 2048 | lm loss: 4.122810E+00 | loss scale: 32768.0 | grad norm: 27419.690 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1614/ 292968 | consumed samples: 3305472 | consumed tokens: 345554944 | elapsed time per iteration (ms): 104046.0 | learning rate: 8.815E-05 | global batch size: 2048 | lm loss: 4.092690E+00 | loss scale: 32768.0 | grad norm: 30448.721 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1615/ 292968 | consumed samples: 3307520 | consumed tokens: 345866240 | elapsed time per iteration (ms): 103724.6 | learning rate: 8.820E-05 | global batch size: 2048 | lm loss: 4.110240E+00 | loss scale: 32768.0 | grad norm: 24857.482 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1616/ 292968 | consumed samples: 3309568 | consumed tokens: 346177536 | elapsed time per iteration (ms): 103766.8 | learning rate: 8.826E-05 | global batch size: 2048 | lm loss: 4.102888E+00 | loss scale: 32768.0 | grad norm: 21184.201 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1617/ 292968 | consumed samples: 3311616 | consumed tokens: 346488832 | elapsed time per iteration (ms): 105762.3 | learning rate: 8.831E-05 | global batch size: 2048 | lm loss: 4.124961E+00 | loss scale: 32768.0 | grad norm: 16497.796 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1618/ 292968 | consumed samples: 3313664 | consumed tokens: 346800128 | elapsed time per iteration (ms): 103324.8 | learning rate: 8.836E-05 | global batch size: 2048 | lm loss: 4.116298E+00 | loss scale: 32768.0 | grad norm: 17602.537 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1619/ 292968 | consumed samples: 3315712 | consumed tokens: 347111424 | elapsed time per iteration (ms): 105596.0 | learning rate: 8.842E-05 | global batch size: 2048 | lm loss: 4.101456E+00 | loss scale: 32768.0 | grad norm: 17671.238 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1620/ 292968 | consumed samples: 3317760 | consumed tokens: 347422720 | elapsed time per iteration (ms): 104949.7 | learning rate: 8.847E-05 | global batch size: 2048 | lm loss: 4.070568E+00 | loss scale: 32768.0 | grad norm: 11812.124 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1621/ 292968 | consumed samples: 3319808 | consumed tokens: 347734016 | elapsed time per iteration (ms): 105413.7 | learning rate: 8.853E-05 | global batch size: 2048 | lm loss: 4.093331E+00 | loss scale: 32768.0 | grad norm: 13240.803 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1622/ 292968 | consumed samples: 3321856 | consumed tokens: 348045312 | elapsed time per iteration (ms): 104234.0 | learning rate: 8.858E-05 | global batch size: 2048 | lm loss: 4.084456E+00 | loss scale: 32768.0 | grad norm: 18153.331 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1623/ 292968 | consumed samples: 3323904 | consumed tokens: 348356608 | elapsed time per iteration (ms): 104008.9 | learning rate: 8.864E-05 | global batch size: 2048 | lm loss: 4.137870E+00 | loss scale: 32768.0 | grad norm: 22937.124 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1624/ 292968 | consumed samples: 3325952 | consumed tokens: 348667904 | elapsed time per iteration (ms): 108236.9 | learning rate: 8.869E-05 | global batch size: 2048 | lm loss: 4.130649E+00 | loss scale: 32768.0 | grad norm: 22403.226 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
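The token accounting in these records is internally consistent: each iteration adds global batch size × curriculum seqlen tokens, so at seqlen 152 the consumed-tokens counter should advance by 2048 × 152 = 311,296 per step (at seqlen 144 it would be 2048 × 144 = 294,912). A minimal sketch checking this against the values logged above; the counter values are copied from the records for iterations 1596-1598, nothing else is assumed:

```python
# Consumed-token increments should equal global_batch_size * curriculum_seqlen.
# Counter values copied from the log records for iterations 1596-1598 above.
consumed_tokens = {1596: 339951616, 1597: 340262912, 1598: 340574208}

GLOBAL_BATCH_SIZE = 2048
SEQLEN = 152  # curriculum seqlen reported from iteration 1597 onward

for it in (1597, 1598):
    delta = consumed_tokens[it] - consumed_tokens[it - 1]
    assert delta == GLOBAL_BATCH_SIZE * SEQLEN == 311296
print("tokens per iteration at seqlen 152:", GLOBAL_BATCH_SIZE * SEQLEN)
```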
- iteration 1625/ 292968 | consumed samples: 3328000 | consumed tokens: 348979200 | elapsed time per iteration (ms): 105860.9 | learning rate: 8.875E-05 | global batch size: 2048 | lm loss: 4.106955E+00 | loss scale: 32768.0 | grad norm: 13178.490 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1626/ 292968 | consumed samples: 3330048 | consumed tokens: 349290496 | elapsed time per iteration (ms): 107268.5 | learning rate: 8.880E-05 | global batch size: 2048 | lm loss: 4.089630E+00 | loss scale: 32768.0 | grad norm: 14359.568 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1627/ 292968 | consumed samples: 3332096 | consumed tokens: 349601792 | elapsed time per iteration (ms): 104625.1 | learning rate: 8.886E-05 | global batch size: 2048 | lm loss: 4.089586E+00 | loss scale: 32768.0 | grad norm: 15003.323 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1628/ 292968 | consumed samples: 3334144 | consumed tokens: 349913088 | elapsed time per iteration (ms): 108335.8 | learning rate: 8.891E-05 | global batch size: 2048 | lm loss: 4.094872E+00 | loss scale: 32768.0 | grad norm: 16826.565 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1629/ 292968 | consumed samples: 3336192 | consumed tokens: 350224384 | elapsed time per iteration (ms): 108368.8 | learning rate: 8.897E-05 | global batch size: 2048 | lm loss: 4.112906E+00 | loss scale: 32768.0 | grad norm: 14035.168 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1630/ 292968 | consumed samples: 3338240 | consumed tokens: 350535680 | elapsed time per iteration (ms): 104237.2 | learning rate: 8.902E-05 | global batch size: 2048 | lm loss: 4.083397E+00 | loss scale: 32768.0 | grad norm: 13727.543 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1631/ 292968 | consumed samples: 3340288 | consumed tokens: 350846976 | elapsed time per iteration (ms): 105956.5 | learning rate: 8.907E-05 | global batch size: 2048 | lm loss: 4.093054E+00 | loss scale: 32768.0 | grad norm: 16220.623 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1632/ 292968 | consumed samples: 3342336 | consumed tokens: 351158272 | elapsed time per iteration (ms): 105716.3 | learning rate: 8.913E-05 | global batch size: 2048 | lm loss: 4.103983E+00 | loss scale: 32768.0 | grad norm: 16233.268 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1633/ 292968 | consumed samples: 3344384 | consumed tokens: 351469568 | elapsed time per iteration (ms): 103815.6 | learning rate: 8.918E-05 | global batch size: 2048 | lm loss: 4.095228E+00 | loss scale: 32768.0 | grad norm: 22160.800 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1634/ 292968 | consumed samples: 3346432 | consumed tokens: 351780864 | elapsed time per iteration (ms): 104674.8 | learning rate: 8.924E-05 | global batch size: 2048 | lm loss: 4.085284E+00 | loss scale: 32768.0 | grad norm: 25265.108 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1635/ 292968 | consumed samples: 3348480 | consumed tokens: 352092160 | elapsed time per iteration (ms): 102150.6 | learning rate: 8.929E-05 | global batch size: 2048 | lm loss: 4.075352E+00 | loss scale: 32768.0 | grad norm: 28917.330 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1636/ 292968 | consumed samples: 3350528 | consumed tokens: 352403456 | elapsed time per iteration (ms): 105052.2 | learning rate: 8.935E-05 | global batch size: 2048 | lm loss: 4.077106E+00 | loss scale: 32768.0 | grad norm: 25485.838 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1637/ 292968 | consumed samples: 3352576 | consumed tokens: 352714752 | elapsed time per iteration (ms): 104599.7 | learning rate: 8.940E-05 | global batch size: 2048 | lm loss: 4.077329E+00 | loss scale: 32768.0 | grad norm: 12939.066 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1638/ 292968 | consumed samples: 3354624 | consumed tokens: 353026048 | elapsed time per iteration (ms): 105925.1 | learning rate: 8.946E-05 | global batch size: 2048 | lm loss: 4.098464E+00 | loss scale: 32768.0 | grad norm: 20994.227 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1639/ 292968 | consumed samples: 3356672 | consumed tokens: 353337344 | elapsed time per iteration (ms): 104832.0 | learning rate: 8.951E-05 | global batch size: 2048 | lm loss: 4.076411E+00 | loss scale: 32768.0 | grad norm: 31301.422 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1640/ 292968 | consumed samples: 3358720 | consumed tokens: 353648640 | elapsed time per iteration (ms): 105830.1 | learning rate: 8.957E-05 | global batch size: 2048 | lm loss: 4.090681E+00 | loss scale: 32768.0 | grad norm: 28914.570 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1641/ 292968 | consumed samples: 3360768 | consumed tokens: 353959936 | elapsed time per iteration (ms): 105386.6 | learning rate: 8.962E-05 | global batch size: 2048 | lm loss: 4.063982E+00 | loss scale: 32768.0 | grad norm: 26324.044 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1642/ 292968 | consumed samples: 3362816 | consumed tokens: 354271232 | elapsed time per iteration (ms): 105316.8 | learning rate: 8.968E-05 | global batch size: 2048 | lm loss: 4.095941E+00 | loss scale: 32768.0 | grad norm: 29958.070 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1643/ 292968 | consumed samples: 3364864 | consumed tokens: 354582528 | elapsed time per iteration (ms): 105866.7 | learning rate: 8.973E-05 | global batch size: 2048 | lm loss: 4.097448E+00 | loss scale: 32768.0 | grad norm: 24311.547 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1644/ 292968 | consumed samples: 3366912 | consumed tokens: 354893824 | elapsed time per iteration (ms): 102722.2 | learning rate: 8.978E-05 | global batch size: 2048 | lm loss: 4.121556E+00 | loss scale: 32768.0 | grad norm: 22838.440 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1645/ 292968 | consumed samples: 3368960 | consumed tokens: 355205120 | elapsed time per iteration (ms): 102369.3 | learning rate: 8.984E-05 | global batch size: 2048 | lm loss: 4.126900E+00 | loss scale: 32768.0 | grad norm: 15945.380 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1646/ 292968 | consumed samples: 3371008 | consumed tokens: 355516416 | elapsed time per iteration (ms): 102335.5 | learning rate: 8.989E-05 | global batch size: 2048 | lm loss: 4.081250E+00 | loss scale: 32768.0 | grad norm: 16045.356 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1647/ 292968 | consumed samples: 3373056 | consumed tokens: 355827712 | elapsed time per iteration (ms): 104554.4 | learning rate: 8.995E-05 | global batch size: 2048 | lm loss: 4.096787E+00 | loss scale: 32768.0 | grad norm: 14378.990 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1648/ 292968 | consumed samples: 3375104 | consumed tokens: 356139008 | elapsed time per iteration (ms): 103357.9 | learning rate: 9.000E-05 | global batch size: 2048 | lm loss: 4.098947E+00 | loss scale: 32768.0 | grad norm: 11919.239 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1649/ 292968 | consumed samples: 3377152 | consumed tokens: 356450304 | elapsed time per iteration (ms): 104705.1 | learning rate: 9.006E-05 | global batch size: 2048 | lm loss: 4.061591E+00 | loss scale: 32768.0 | grad norm: 12893.261 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1650/ 292968 | consumed samples: 3379200 | consumed tokens: 356761600 | elapsed time per iteration (ms): 103858.0 | learning rate: 9.011E-05 | global batch size: 2048 | lm loss: 4.091815E+00 | loss scale: 32768.0 | grad norm: 12688.032 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 1650 | lm loss value: 4.062670E+00 | lm loss PPL: 5.812930E+01 |
-------------------------------------------------------------------------------------------------
- iteration 1651/ 292968 | consumed samples: 3381248 | consumed tokens: 357072896 | elapsed time per iteration (ms): 274026.4 | learning rate: 9.017E-05 | global batch size: 2048 | lm loss: 4.077395E+00 | loss scale: 32768.0 | grad norm: 15553.819 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1652/ 292968 | consumed samples: 3383296 | consumed tokens: 357384192 | elapsed time per iteration (ms): 103866.5 | learning rate: 9.022E-05 | global batch size: 2048 | lm loss: 4.090050E+00 | loss scale: 32768.0 | grad norm: 15226.873 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
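The validation report at iteration 1650 above pairs an lm loss with its perplexity, and the two are related by PPL = exp(loss); the elapsed-time spike at iteration 1651 (274,026 ms against the usual ~104,000 ms) presumably reflects the evaluation pass being absorbed into that iteration's timing. A quick check of the logged loss/PPL pair, using only the two values printed in the block above:

```python
import math

lm_loss = 4.062670       # validation lm loss at iteration 1650, from the log
reported_ppl = 58.12930  # logged as 5.812930E+01

# Perplexity is the exponential of the (natural-log) cross-entropy loss.
assert math.isclose(math.exp(lm_loss), reported_ppl, rel_tol=1e-5)
print(f"exp({lm_loss}) = {math.exp(lm_loss):.5f}")
```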
- iteration 1653/ 292968 | consumed samples: 3385344 | consumed tokens: 357695488 | elapsed time per iteration (ms): 105850.0 | learning rate: 9.028E-05 | global batch size: 2048 | lm loss: 4.090162E+00 | loss scale: 32768.0 | grad norm: 16051.239 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1654/ 292968 | consumed samples: 3387392 | consumed tokens: 358006784 | elapsed time per iteration (ms): 108469.2 | learning rate: 9.033E-05 | global batch size: 2048 | lm loss: 4.094681E+00 | loss scale: 32768.0 | grad norm: 17659.022 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1655/ 292968 | consumed samples: 3389440 | consumed tokens: 358318080 | elapsed time per iteration (ms): 103511.8 | learning rate: 9.039E-05 | global batch size: 2048 | lm loss: 4.069203E+00 | loss scale: 32768.0 | grad norm: 20180.523 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1656/ 292968 | consumed samples: 3391488 | consumed tokens: 358629376 | elapsed time per iteration (ms): 104712.9 | learning rate: 9.044E-05 | global batch size: 2048 | lm loss: 4.096534E+00 | loss scale: 32768.0 | grad norm: 24005.322 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1657/ 292968 | consumed samples: 3393536 | consumed tokens: 358940672 | elapsed time per iteration (ms): 103778.9 | learning rate: 9.049E-05 | global batch size: 2048 | lm loss: 4.074844E+00 | loss scale: 32768.0 | grad norm: 21064.192 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1658/ 292968 | consumed samples: 3395584 | consumed tokens: 359251968 | elapsed time per iteration (ms): 104971.5 | learning rate: 9.055E-05 | global batch size: 2048 | lm loss: 4.091407E+00 | loss scale: 32768.0 | grad norm: 21737.699 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1659/ 292968 | consumed samples: 3397632 | consumed tokens: 359563264 | elapsed time per iteration (ms): 103786.3 | learning rate: 9.060E-05 | global batch size: 2048 | lm loss: 4.084952E+00 | loss scale: 32768.0 | grad norm: 24927.244 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1660/ 292968 | consumed samples: 3399680 | consumed tokens: 359874560 | elapsed time per iteration (ms): 104442.0 | learning rate: 9.066E-05 | global batch size: 2048 | lm loss: 4.095727E+00 | loss scale: 32768.0 | grad norm: 24157.854 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1661/ 292968 | consumed samples: 3401728 | consumed tokens: 360185856 | elapsed time per iteration (ms): 103759.7 | learning rate: 9.071E-05 | global batch size: 2048 | lm loss: 4.073194E+00 | loss scale: 32768.0 | grad norm: 22588.024 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1662/ 292968 | consumed samples: 3403776 | consumed tokens: 360497152 | elapsed time per iteration (ms): 102736.8 | learning rate: 9.077E-05 | global batch size: 2048 | lm loss: 4.076020E+00 | loss scale: 32768.0 | grad norm: 17796.655 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1663/ 292968 | consumed samples: 3405824 | consumed tokens: 360808448 | elapsed time per iteration (ms): 104171.3 | learning rate: 9.082E-05 | global batch size: 2048 | lm loss: 4.085265E+00 | loss scale: 32768.0 | grad norm: 16153.073 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1664/ 292968 | consumed samples: 3407872 | consumed tokens: 361119744 | elapsed time per iteration (ms): 102943.0 | learning rate: 9.088E-05 | global batch size: 2048 | lm loss: 4.075907E+00 | loss scale: 32768.0 | grad norm: 15372.744 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1665/ 292968 | consumed samples: 3409920 | consumed tokens: 361431040 | elapsed time per iteration (ms): 104235.3 | learning rate: 9.093E-05 | global batch size: 2048 | lm loss: 4.057152E+00 | loss scale: 32768.0 | grad norm: 15702.412 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1666/ 292968 | consumed samples: 3411968 | consumed tokens: 361742336 | elapsed time per iteration (ms): 103808.1 | learning rate: 9.099E-05 | global batch size: 2048 | lm loss: 4.080420E+00 | loss scale: 32768.0 | grad norm: 15882.363 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1667/ 292968 | consumed samples: 3414016 | consumed tokens: 362053632 | elapsed time per iteration (ms): 104057.3 | learning rate: 9.104E-05 | global batch size: 2048 | lm loss: 4.077966E+00 | loss scale: 32768.0 | grad norm: 22408.987 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1668/ 292968 | consumed samples: 3416064 | consumed tokens: 362364928 | elapsed time per iteration (ms): 102642.3 | learning rate: 9.110E-05 | global batch size: 2048 | lm loss: 4.078831E+00 | loss scale: 32768.0 | grad norm: 24623.593 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1669/ 292968 | consumed samples: 3418112 | consumed tokens: 362676224 | elapsed time per iteration (ms): 103366.2 | learning rate: 9.115E-05 | global batch size: 2048 | lm loss: 4.069817E+00 | loss scale: 32768.0 | grad norm: 22502.048 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1670/ 292968 | consumed samples: 3420160 | consumed tokens: 362987520 | elapsed time per iteration (ms): 105120.4 | learning rate: 9.120E-05 | global batch size: 2048 | lm loss: 4.074476E+00 | loss scale: 32768.0 | grad norm: 15940.076 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1671/ 292968 | consumed samples: 3422208 | consumed tokens: 363298816 | elapsed time per iteration (ms): 104161.0 | learning rate: 9.126E-05 | global batch size: 2048 | lm loss: 4.069888E+00 | loss scale: 32768.0 | grad norm: 11064.604 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1672/ 292968 | consumed samples: 3424256 | consumed tokens: 363610112 | elapsed time per iteration (ms): 104738.5 | learning rate: 9.131E-05 | global batch size: 2048 | lm loss: 4.072707E+00 | loss scale: 32768.0 | grad norm: 13357.223 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1673/ 292968 | consumed samples: 3426304 | consumed tokens: 363921408 | elapsed time per iteration (ms): 105441.8 | learning rate: 9.137E-05 | global batch size: 2048 | lm loss: 4.051648E+00 | loss scale: 32768.0 | grad norm: 16233.230 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1674/ 292968 | consumed samples: 3428352 | consumed tokens: 364232704 | elapsed time per iteration (ms): 105440.5 | learning rate: 9.142E-05 | global batch size: 2048 | lm loss: 4.091999E+00 | loss scale: 32768.0 | grad norm: 19121.321 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1675/ 292968 | consumed samples: 3430400 | consumed tokens: 364544000 | elapsed time per iteration (ms): 105606.6 | learning rate: 9.148E-05 | global batch size: 2048 | lm loss: 4.079268E+00 | loss scale: 32768.0 | grad norm: 21691.195 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1676/ 292968 | consumed samples: 3432448 | consumed tokens: 364855296 | elapsed time per iteration (ms): 106061.5 | learning rate: 9.153E-05 | global batch size: 2048 | lm loss: 4.084841E+00 | loss scale: 32768.0 | grad norm: 20336.343 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1677/ 292968 | consumed samples: 3434496 | consumed tokens: 365166592 | elapsed time per iteration (ms): 104631.3 | learning rate: 9.159E-05 | global batch size: 2048 | lm loss: 4.055280E+00 | loss scale: 32768.0 | grad norm: 21637.326 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1678/ 292968 | consumed samples: 3436544 | consumed tokens: 365477888 | elapsed time per iteration (ms): 103087.9 | learning rate: 9.164E-05 | global batch size: 2048 | lm loss: 4.080649E+00 | loss scale: 32768.0 | grad norm: 19833.149 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1679/ 292968 | consumed samples: 3438592 | consumed tokens: 365789184 | elapsed time per iteration (ms): 106848.9 | learning rate: 9.170E-05 | global batch size: 2048 | lm loss: 4.062929E+00 | loss scale: 32768.0 | grad norm: 20125.597 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1680/ 292968 | consumed samples: 3440640 | consumed tokens: 366100480 | elapsed time per iteration (ms): 104510.7 | learning rate: 9.175E-05 | global batch size: 2048 | lm loss: 4.078937E+00 | loss scale: 32768.0 | grad norm: 17836.390 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1681/ 292968 | consumed samples: 3442688 | consumed tokens: 366411776 | elapsed time per iteration (ms): 103843.3 | learning rate: 9.181E-05 | global batch size: 2048 | lm loss: 4.072157E+00 | loss scale: 32768.0 | grad norm: 17488.683 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1682/ 292968 | consumed samples: 3444736 | consumed tokens: 366723072 | elapsed time per iteration (ms): 103057.8 | learning rate: 9.186E-05 | global batch size: 2048 | lm loss: 4.082258E+00 | loss scale: 32768.0 | grad norm: 20319.838 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1683/ 292968 | consumed samples: 3446784 | consumed tokens: 367034368 | elapsed time per iteration (ms): 104778.2 | learning rate: 9.191E-05 | global batch size: 2048 | lm loss: 4.058461E+00 | loss scale: 32768.0 | grad norm: 18419.626 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1684/ 292968 | consumed samples: 3448832 | consumed tokens: 367345664 | elapsed time per iteration (ms): 103318.2 | learning rate: 9.197E-05 | global batch size: 2048 | lm loss: 4.066132E+00 | loss scale: 32768.0 | grad norm: 16366.717 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1685/ 292968 | consumed samples: 3450880 | consumed tokens: 367656960 | elapsed time per iteration (ms): 103929.2 | learning rate: 9.202E-05 | global batch size: 2048 | lm loss: 4.037179E+00 | loss scale: 32768.0 | grad norm: 14918.130 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1686/ 292968 | consumed samples: 3452928 | consumed tokens: 367968256 | elapsed time per iteration (ms): 103796.7 | learning rate: 9.208E-05 | global batch size: 2048 | lm loss: 4.049829E+00 | loss scale: 32768.0 | grad norm: 18425.672 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1687/ 292968 | consumed samples: 3454976 | consumed tokens: 368279552 | elapsed time per iteration (ms): 105536.7 | learning rate: 9.213E-05 | global batch size: 2048 | lm loss: 4.080143E+00 | loss scale: 32768.0 | grad norm: 17810.737 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1688/ 292968 | consumed samples: 3457024 | consumed tokens: 368590848 | elapsed time per iteration (ms): 106264.5 | learning rate: 9.219E-05 | global batch size: 2048 | lm loss: 4.066893E+00 | loss scale: 32768.0 | grad norm: 17929.575 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1689/ 292968 | consumed samples: 3459072 | consumed tokens: 368902144 | elapsed time per iteration (ms): 103090.0 | learning rate: 9.224E-05 | global batch size: 2048 | lm loss: 4.030958E+00 | loss scale: 32768.0 | grad norm: 17288.540 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1690/ 292968 | consumed samples: 3461120 | consumed tokens: 369213440 | elapsed time per iteration (ms): 101686.9 | learning rate: 9.230E-05 | global batch size: 2048 | lm loss: 4.084970E+00 | loss scale: 32768.0 | grad norm: 18570.045 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1691/ 292968 | consumed samples: 3463168 | consumed tokens: 369524736 | elapsed time per iteration (ms): 104181.4 | learning rate: 9.235E-05 | global batch size: 2048 | lm loss: 4.061717E+00 | loss scale: 32768.0 | grad norm: 19511.814 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1692/ 292968 | consumed samples: 3465216 | consumed tokens: 369836032 | elapsed time per iteration (ms): 105037.9 | learning rate: 9.241E-05 | global batch size: 2048 | lm loss: 4.065639E+00 | loss scale: 32768.0 | grad norm: 19089.374 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1693/ 292968 | consumed samples: 3467264 | consumed tokens: 370147328 | elapsed time per iteration (ms): 103825.5 | learning rate: 9.246E-05 | global batch size: 2048 | lm loss: 4.078660E+00 | loss scale: 32768.0 | grad norm: 18888.943 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1694/ 292968 | consumed samples: 3469312 | consumed tokens: 370458624 | elapsed time per iteration (ms): 103234.5 | learning rate: 9.251E-05 | global batch size: 2048 | lm loss: 4.074324E+00 | loss scale: 32768.0 | grad norm: 17564.846 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1695/ 292968 | consumed samples: 3471360 | consumed tokens: 370769920 | elapsed time per iteration (ms): 105302.2 | learning rate: 9.257E-05 | global batch size: 2048 | lm loss: 4.054060E+00 | loss scale: 32768.0 | grad norm: 17131.721 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1696/ 292968 | consumed samples: 3473408 | consumed tokens: 371081216 | elapsed time per iteration (ms): 103540.1 | learning rate: 9.262E-05 | global batch size: 2048 | lm loss: 4.069779E+00 | loss scale: 32768.0 | grad norm: 17957.997 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1697/ 292968 | consumed samples: 3475456 | consumed tokens: 371392512 | elapsed time per iteration (ms): 103568.9 | learning rate: 9.268E-05 | global batch size: 2048 | lm loss: 4.054748E+00 | loss scale: 32768.0 | grad norm: 21461.476 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1698/ 292968 | consumed samples: 3477504 | consumed tokens: 371703808 | elapsed time per iteration (ms): 103168.7 | learning rate: 9.273E-05 | global batch size: 2048 | lm loss: 4.052831E+00 | loss scale: 32768.0 | grad norm: 17904.304 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1699/ 292968 | consumed samples: 3479552 | consumed tokens: 372015104 | elapsed time per iteration (ms): 104187.7 | learning rate: 9.279E-05 | global batch size: 2048 | lm loss: 4.047625E+00 | loss scale: 32768.0 | grad norm: 18401.054 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1700/ 292968 | consumed samples: 3481600 | consumed tokens: 372326400 | elapsed time per iteration (ms): 108280.6 | learning rate: 9.284E-05 | global batch size: 2048 | lm loss: 4.066005E+00 | loss scale: 32768.0 | grad norm: 20260.025 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1701/ 292968 | consumed samples: 3483648 | consumed tokens: 372637696 | elapsed time per iteration (ms): 107462.8 | learning rate: 9.290E-05 | global batch size: 2048 | lm loss: 4.056851E+00 | loss scale: 32768.0 | grad norm: 21935.259 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1702/ 292968 | consumed samples: 3485696 | consumed tokens: 372948992 | elapsed time per iteration (ms): 103131.5 | learning rate: 9.295E-05 | global batch size: 2048 | lm loss: 4.065135E+00 | loss scale: 32768.0 | grad norm: 23087.369 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1703/ 292968 | consumed samples: 3487744 | consumed tokens: 373260288 | elapsed time per iteration (ms): 103384.6 | learning rate: 9.301E-05 | global batch size: 2048 | lm loss: 4.060477E+00 | loss scale: 32768.0 | grad norm: 27990.996 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1704/ 292968 | consumed samples: 3489792 | consumed tokens: 373571584 | elapsed time per iteration (ms): 103046.3 | learning rate: 9.306E-05 | global batch size: 2048 | lm loss: 4.073426E+00 | loss scale: 32768.0 | grad norm: 27638.165 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1705/ 292968 | consumed samples: 3491840 | consumed tokens: 373882880 | elapsed time per iteration (ms): 107130.5 | learning rate: 9.312E-05 | global batch size: 2048 | lm loss: 4.032431E+00 | loss scale: 32768.0 | grad norm: 24565.265 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1706/ 292968 | consumed samples: 3493888 | consumed tokens: 374194176 | elapsed time per iteration (ms): 104413.3 | learning rate: 9.317E-05 | global batch size: 2048 | lm loss: 4.057915E+00 | loss scale: 32768.0 | grad norm: 17639.611 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1707/ 292968 | consumed samples: 3495936 | consumed tokens: 374505472 | elapsed time per iteration (ms): 101077.0 | learning rate: 9.322E-05 | global batch size: 2048 | lm loss: 4.049128E+00 | loss scale: 32768.0 | grad norm: 13095.123 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1708/ 292968 | consumed samples: 3497984 | consumed tokens: 374816768 | elapsed time per iteration (ms): 102924.3 | learning rate: 9.328E-05 | global batch size: 2048 | lm loss: 4.037101E+00 | loss scale: 32768.0 | grad norm: 15349.775 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1709/ 292968 | consumed samples: 3500032 | consumed tokens: 375128064 | elapsed time per iteration (ms): 103413.1 | learning rate: 9.333E-05 | global batch size: 2048 | lm loss: 4.032069E+00 | loss scale: 32768.0 | grad norm: 16122.121 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1710/ 292968 | consumed samples: 3502080 | consumed tokens: 375439360 | elapsed time per iteration (ms): 104275.5 | learning rate: 9.339E-05 | global batch size: 2048 | lm loss: 4.063254E+00 | loss scale: 32768.0 | grad norm: 17741.583 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1711/ 292968 | consumed samples: 3504128 | consumed tokens: 375750656 | elapsed time per iteration (ms): 104601.0 | learning rate: 9.344E-05 | global batch size: 2048 | lm loss: 4.061349E+00 | loss scale: 32768.0 | grad norm: 17927.963 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1712/ 292968 | consumed samples: 3506176 | consumed tokens: 376061952 | elapsed time per iteration (ms): 104530.8 | learning rate: 9.350E-05 | global batch size: 2048 | lm loss: 4.075251E+00 | loss scale: 32768.0 | grad norm: 18028.516 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1713/ 292968 | consumed samples: 3508224 | consumed tokens: 376373248 | elapsed time per iteration (ms): 103866.1 | learning rate: 9.355E-05 | global batch size: 2048 | lm loss: 4.043147E+00 | loss scale: 32768.0 | grad norm: 18270.574 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1714/ 292968 | consumed samples: 3510272 | consumed tokens: 376684544 | elapsed time per iteration (ms): 102394.4 | learning rate: 9.361E-05 | global batch size: 2048 | lm loss: 4.055283E+00 | loss scale: 32768.0 | grad norm: 18001.035 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1715/ 292968 | consumed samples: 3512320 | consumed tokens: 376995840 | elapsed time per iteration (ms): 104355.2 | learning rate: 9.366E-05 | global batch size: 2048 | lm loss: 4.058062E+00 | loss scale: 32768.0 | grad norm: 20872.989 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1716/ 292968 | consumed samples: 3514368 | consumed tokens: 377307136 | elapsed time per iteration (ms): 103000.6 | learning rate: 9.372E-05 | global batch size: 2048 | lm loss: 4.063187E+00 | loss scale: 32768.0 | grad norm: 20769.261 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1717/ 292968 | consumed samples: 3516416 | consumed tokens: 377618432 | elapsed time per iteration (ms): 104997.1 | learning rate: 9.377E-05 | global batch size: 2048 | lm loss: 4.064139E+00 | loss scale: 32768.0 | grad norm: 16050.391 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1718/ 292968 | consumed samples: 3518464 | consumed tokens: 377929728 | elapsed time per iteration (ms): 104096.1 | learning rate: 9.383E-05 | global batch size: 2048 | lm loss: 4.059897E+00 | loss scale: 32768.0 | grad norm: 18187.884 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1719/ 292968 | consumed samples: 3520512 | consumed tokens: 378241024 | elapsed time per iteration (ms): 102064.3 | learning rate: 9.388E-05 | global batch size: 2048 | lm loss: 4.044410E+00 | loss scale: 32768.0 | grad norm: 17084.383 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1720/ 292968 | consumed samples: 3522560 | consumed tokens: 378552320 | elapsed time per iteration (ms): 105880.7 | learning rate: 9.393E-05 | global batch size: 2048 | lm loss: 4.070328E+00 | loss scale: 32768.0 | grad norm: 13024.504 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1721/ 292968 | consumed samples: 3524608 | consumed tokens: 378863616 | elapsed time per iteration (ms): 104298.1 | learning rate: 9.399E-05 | global batch size: 2048 | lm loss: 4.031842E+00 | loss scale: 32768.0 | grad norm: 15876.680 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1722/ 292968 | consumed samples: 3526656 | consumed tokens: 379174912 | elapsed time per iteration (ms): 102440.2 | learning rate: 9.404E-05 | global batch size: 2048 | lm loss: 4.070487E+00 | loss scale: 32768.0 | grad norm: 21407.903 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1723/ 292968 | consumed samples: 3528704 | consumed tokens: 379486208 | elapsed time per iteration (ms): 103911.1 | learning rate: 9.410E-05 | global batch size: 2048 | lm loss: 4.072162E+00 | loss scale: 32768.0 | grad norm: 23464.721 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1724/ 292968 | consumed samples: 3530752 | consumed tokens: 379797504 | elapsed time per iteration (ms): 105244.1 | learning rate: 9.415E-05 | global batch size: 2048 | lm loss: 4.069732E+00 | loss scale: 32768.0 | grad norm: 25086.912 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1725/ 292968 | consumed samples: 3532800 | consumed tokens: 380108800 | elapsed time per iteration (ms): 104812.8 | learning rate: 9.421E-05 | global batch size: 2048 | lm loss: 4.038998E+00 | loss scale: 32768.0 | grad norm: 18108.041 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1726/ 292968 | consumed samples: 3534848 | consumed tokens: 380420096 | elapsed time per iteration (ms): 103815.4 | learning rate: 9.426E-05 | global batch size: 2048 | lm loss: 4.080314E+00 | loss scale: 32768.0 | grad norm: 18252.466 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1727/ 292968 | consumed samples: 3536896 | consumed tokens: 380731392 | elapsed time per iteration (ms): 104170.0 | learning rate: 9.432E-05 | global batch size: 2048 | lm loss: 4.069029E+00 | loss scale: 32768.0 | grad norm: 16820.112 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1728/ 292968 | consumed samples: 3538944 | consumed tokens: 381042688 | elapsed time per iteration (ms): 105287.8 | learning rate: 9.437E-05 | global batch size: 2048 | lm loss: 4.060335E+00 | loss scale: 32768.0 | grad norm: 15671.310 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1729/ 292968 | consumed samples: 3540992 | consumed tokens: 381353984 | elapsed time per iteration (ms): 104935.8 | learning rate: 9.443E-05 | global batch size: 2048 | lm loss: 4.069222E+00 | loss scale: 32768.0 | grad norm: 15640.061 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1730/ 292968 | consumed samples: 3543040 | consumed tokens: 381665280 | elapsed time per iteration (ms): 104154.1 | learning rate: 9.448E-05 | global batch size: 2048 | lm loss: 4.028211E+00 | loss scale: 32768.0 | grad norm: 18999.045 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1731/ 292968 | consumed samples: 3545088 | consumed tokens: 381976576 | elapsed time per iteration (ms): 103121.6 | learning rate: 9.454E-05 | global batch size: 2048 | lm loss: 4.041728E+00 | loss scale: 32768.0 | grad norm: 20568.738 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1732/ 292968 | consumed samples: 3547136 | consumed tokens: 382287872 | elapsed time per iteration (ms): 103176.2 | learning rate: 9.459E-05 | global batch size: 2048 | lm loss: 4.037498E+00 | loss scale: 32768.0 | grad norm: 25422.595 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1733/ 292968 | consumed samples: 3549184 | consumed tokens: 382599168 | elapsed time per iteration (ms): 104028.6 | learning rate: 9.464E-05 | global batch size: 2048 | lm loss: 4.073191E+00 | loss scale: 32768.0 | grad norm: 24540.738 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1734/ 292968 | consumed samples: 3551232 | consumed tokens: 382910464 | elapsed time per iteration (ms): 105397.3 | learning rate: 9.470E-05 | global batch size: 2048 | lm loss: 4.065447E+00 | loss scale: 32768.0 | grad norm: 18126.665 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1735/ 292968 | consumed samples: 3553280 | consumed tokens: 383221760 | elapsed time per iteration (ms): 103424.7 | learning rate: 9.475E-05 | global batch size: 2048 | lm loss: 4.028213E+00 | loss scale: 32768.0 | grad norm: 22430.829 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1736/ 292968 | consumed samples: 3555328 | consumed tokens: 383533056 | elapsed time per iteration (ms): 104211.8 | learning rate: 9.481E-05 | global batch size: 2048 | lm loss: 4.053311E+00 | loss scale: 32768.0 | grad norm: 24156.605 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1737/ 292968 | consumed samples: 3557376 | consumed tokens: 383844352 | elapsed time per iteration (ms): 104128.9 | learning rate: 9.486E-05 | global batch size: 2048 | lm loss: 4.075110E+00 | loss scale: 32768.0 | grad norm: 18519.497 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1738/ 292968 | consumed samples: 3559424 | consumed tokens: 384155648 | elapsed time per iteration (ms): 104285.7 | learning rate: 9.492E-05 | global batch size: 2048 | lm loss: 4.059368E+00 | loss scale: 32768.0 | grad norm: 15976.773 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1739/ 292968 | consumed samples: 3561472 | consumed tokens: 384466944 | elapsed time per iteration (ms): 102906.6 | learning rate: 9.497E-05 | global batch size: 2048 | lm loss: 4.048221E+00 | loss scale: 32768.0 | grad norm: 19892.787 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1740/ 292968 | consumed samples: 3563520 | consumed tokens: 384778240 | elapsed time per iteration (ms): 103698.6 | learning rate: 9.503E-05 | global batch size: 2048 | lm loss: 4.063043E+00 | loss scale: 32768.0 | grad norm: 24953.420 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1741/ 292968 | consumed samples: 3565568 | consumed tokens: 385089536 | elapsed time per iteration (ms): 102848.2 | learning rate: 9.508E-05 | global batch size: 2048 | lm loss: 4.035288E+00 | loss scale: 32768.0 | grad norm: 26943.940 | num zeros: 0.0 | curriculum seqlen: 152 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1742/ 292968 | consumed samples: 3567616 | consumed tokens: 385417216 | elapsed time per iteration (ms): 105021.7 | learning rate: 9.514E-05 | global batch size: 2048 | lm loss: 4.130017E+00 | loss scale: 32768.0 | grad norm: 26173.774 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1743/ 292968 | consumed samples: 3569664 | consumed tokens: 385744896 | elapsed time per iteration (ms): 105693.2 | learning rate: 9.519E-05 | global batch size: 2048 | lm loss: 4.125850E+00 | loss scale: 32768.0 | grad norm: 18763.334 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1744/ 292968 | consumed samples: 3571712 | consumed tokens: 386072576 | elapsed time per iteration (ms): 104704.6 | learning rate: 9.525E-05 | global batch size: 2048 | lm loss: 4.159922E+00 | loss scale: 32768.0 | grad norm: 25572.155 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1745/ 292968 | consumed samples: 3573760 | consumed tokens: 386400256 | elapsed time per iteration (ms): 104895.0 | learning rate: 9.530E-05 | global batch size: 2048 | lm loss: 4.114259E+00 | loss scale: 32768.0 | grad norm: 26949.425 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1746/ 292968 | consumed samples: 3575808 | consumed tokens: 386727936 | elapsed time per iteration (ms): 105381.3 | learning rate: 9.535E-05 | global batch size: 2048 | lm loss: 4.107590E+00 | loss scale: 32768.0 | grad norm: 26567.732 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1747/ 292968 | consumed samples: 3577856 | consumed tokens: 387055616 | elapsed time per iteration (ms): 103854.0 | learning rate: 9.541E-05 | global batch size: 2048 | lm loss: 4.054446E+00 | loss scale: 32768.0 | grad norm: 29392.670 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1748/ 292968 | consumed samples: 3579904 | consumed tokens: 387383296 | elapsed time per iteration (ms): 104951.3 | learning rate: 9.546E-05 | global batch size: 2048 | lm loss: 4.088350E+00 | loss scale: 32768.0 | grad norm: 26226.130 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1749/ 292968 | consumed samples: 3581952 | consumed tokens: 387710976 | elapsed time per iteration (ms): 103296.0 | learning rate: 9.552E-05 | global batch size: 2048 | lm loss: 4.098500E+00 | loss scale: 32768.0 | grad norm: 25561.708 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1750/ 292968 | consumed samples: 3584000 | consumed tokens: 388038656 | elapsed time per iteration (ms): 105675.5 | learning rate: 9.557E-05 | global batch size: 2048 | lm loss: 4.091243E+00 | loss scale: 32768.0 | grad norm: 20497.761 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1751/ 292968 | consumed samples: 3586048 | consumed tokens: 388366336 | elapsed time per iteration (ms): 104350.1 | learning rate: 9.563E-05 | global batch size: 2048 | lm loss: 4.088629E+00 | loss scale: 32768.0 | grad norm: 19994.456 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1752/ 292968 | consumed samples: 3588096 | consumed tokens: 388694016 | elapsed time per iteration (ms): 105499.2 | learning rate: 9.568E-05 | global batch size: 2048 | lm loss: 4.054234E+00 | loss scale: 32768.0 | grad norm: 18374.245 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1753/ 292968 | consumed samples: 3590144 | consumed tokens: 389021696 | elapsed time per iteration (ms): 104177.2 | learning rate: 9.574E-05 | global batch size: 2048 | lm loss: 4.039162E+00 | loss scale: 32768.0 | grad norm: 21533.347 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1754/ 292968 | consumed samples: 3592192 | consumed tokens: 389349376 | elapsed time per iteration (ms): 105649.0 | learning rate: 9.579E-05 | global batch size: 2048 | lm loss: 4.083185E+00 | loss scale: 32768.0 | grad norm: 21724.018 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1755/ 292968 | consumed samples: 3594240 | consumed tokens: 389677056 | elapsed time per iteration (ms): 100450.8 | learning rate: 9.585E-05 | global batch size: 2048 | lm loss: 4.058789E+00 | loss scale: 32768.0 | grad norm: 12643.674 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1756/ 292968 | consumed samples: 3596288 | consumed tokens: 390004736 | elapsed time per iteration (ms): 105244.2 | learning rate: 9.590E-05 | global batch size: 2048 | lm loss: 4.017322E+00 | loss scale: 32768.0 | grad norm: 15053.510 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1757/ 292968 | consumed samples: 3598336 | consumed tokens: 390332416 | elapsed time per iteration (ms): 104006.8 | learning rate: 9.596E-05 | global batch size: 2048 | lm loss: 4.020484E+00 | loss scale: 32768.0 | grad norm: 17180.553 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1758/ 292968 | consumed samples: 3600384 | consumed tokens: 390660096 | elapsed time per iteration (ms): 105813.0 | learning rate: 9.601E-05 | global batch size: 2048 | lm loss: 4.025072E+00 | loss scale: 32768.0 | grad norm: 15750.183 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1759/ 292968 | consumed samples: 3602432 | consumed tokens: 390987776 | elapsed time per iteration (ms): 104517.2 | learning rate: 9.606E-05 | global batch size: 2048 | lm loss: 4.040825E+00 | loss scale: 32768.0 | grad norm: 14238.547 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1760/ 292968 | consumed samples: 3604480 | consumed tokens: 391315456 | elapsed time per iteration (ms): 104145.6 | learning rate: 9.612E-05 | global batch size: 2048 | lm loss: 4.042304E+00 | loss scale: 32768.0 | grad norm: 13840.260 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1761/ 292968 | consumed samples: 3606528 | consumed tokens: 391643136 | elapsed time per iteration (ms): 104353.9 | learning rate: 9.617E-05 | global batch size: 2048 | lm loss: 4.025782E+00 | loss scale: 32768.0 | grad norm: 14593.991 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1762/ 292968 | consumed samples: 3608576 | consumed tokens: 391970816 | elapsed time per iteration (ms): 103546.2 | learning rate: 9.623E-05 | global batch size: 2048 | lm loss: 4.011271E+00 | loss scale: 32768.0 | grad norm: 16213.931 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1763/ 292968 | consumed samples: 3610624 | consumed tokens: 392298496 | elapsed time per iteration (ms): 104225.9 | learning rate: 9.628E-05 | global batch size: 2048 | lm loss: 4.074361E+00 | loss scale: 32768.0 | grad norm: 16520.734 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1764/ 292968 | consumed samples: 3612672 | consumed tokens: 392626176 | elapsed time per iteration (ms): 105029.5 | learning rate: 9.634E-05 | global batch size: 2048 | lm loss: 4.023820E+00 | loss scale: 32768.0 | grad norm: 13296.735 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1765/ 292968 | consumed samples: 3614720 | consumed tokens: 392953856 | elapsed time per iteration (ms): 100437.3 | learning rate: 9.639E-05 | global batch size: 2048 | lm loss: 4.025954E+00 | loss scale: 32768.0 | grad norm: 16868.932 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1766/ 292968 | consumed samples: 3616768 | consumed tokens: 393281536 | elapsed time per iteration (ms): 103088.2 | learning rate: 9.645E-05 | global batch size: 2048 | lm loss: 4.048180E+00 | loss scale: 32768.0 | grad norm: 22196.603 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1767/ 292968 | consumed samples: 3618816 | consumed tokens: 393609216 | elapsed time per iteration (ms): 103998.6 | learning rate: 9.650E-05 | global batch size: 2048 | lm loss: 4.034055E+00 | loss scale: 32768.0 | grad norm: 16510.172 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1768/ 292968 | consumed samples: 3620864 | consumed tokens: 393936896 | elapsed time per iteration (ms): 105686.8 | learning rate: 9.656E-05 | global batch size: 2048 | lm loss: 4.027907E+00 | loss scale: 32768.0 | grad norm: 22722.776 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
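At iteration 1742 the curriculum seqlen steps from 152 to 160, and two effects are visible in the surrounding records: the consumed-tokens increment rises to 2048 × 160 = 327,680 (385,417,216 − 385,089,536 at the transition itself), and the lm loss ticks up briefly (4.035 at 1741 to 4.130 at 1742) before resuming its downward trend, as one would expect when longer sequences are introduced. The same increment check as before, at the new length, with counter values copied from iterations 1741-1743:

```python
# After the curriculum step at iteration 1742, increments should use seqlen 160.
# Counter values copied from the log records for iterations 1741-1743 above.
consumed_tokens = {1741: 385089536, 1742: 385417216, 1743: 385744896}

for it in (1742, 1743):
    delta = consumed_tokens[it] - consumed_tokens[it - 1]
    assert delta == 2048 * 160 == 327680
print("tokens per iteration at seqlen 160:", 2048 * 160)
```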
- iteration 1769/ 292968 | consumed samples: 3622912 | consumed tokens: 394264576 | elapsed time per iteration (ms): 107160.9 | learning rate: 9.661E-05 | global batch size: 2048 | lm loss: 4.009619E+00 | loss scale: 32768.0 | grad norm: 20594.360 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1770/ 292968 | consumed samples: 3624960 | consumed tokens: 394592256 | elapsed time per iteration (ms): 103221.1 | learning rate: 9.667E-05 | global batch size: 2048 | lm loss: 4.055212E+00 | loss scale: 32768.0 | grad norm: 22058.541 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1771/ 292968 | consumed samples: 3627008 | consumed tokens: 394919936 | elapsed time per iteration (ms): 106783.7 | learning rate: 9.672E-05 | global batch size: 2048 | lm loss: 4.024175E+00 | loss scale: 32768.0 | grad norm: 22477.550 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1772/ 292968 | consumed samples: 3629056 | consumed tokens: 395247616 | elapsed time per iteration (ms): 103544.3 | learning rate: 9.677E-05 | global batch size: 2048 | lm loss: 4.018074E+00 | loss scale: 32768.0 | grad norm: 16959.428 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1773/ 292968 | consumed samples: 3631104 | consumed tokens: 395575296 | elapsed time per iteration (ms): 104688.1 | learning rate: 9.683E-05 | global batch size: 2048 | lm loss: 4.030293E+00 | loss scale: 32768.0 | grad norm: 17157.786 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1774/ 292968 | consumed samples: 3633152 | consumed tokens: 395902976 | elapsed time per iteration (ms): 105136.2 | learning rate: 9.688E-05 | global batch size: 2048 | lm loss: 4.047806E+00 | loss scale: 32768.0 | grad norm: 19579.199 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1775/ 292968 | consumed samples: 3635200 | consumed tokens: 396230656 | elapsed time per iteration (ms): 104609.8 | learning rate: 9.694E-05 | global batch size: 2048 | lm loss: 4.031793E+00 | loss scale: 32768.0 | grad norm: 18619.100 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1776/ 292968 | consumed samples: 3637248 | consumed tokens: 396558336 | elapsed time per iteration (ms): 105852.0 | learning rate: 9.699E-05 | global batch size: 2048 | lm loss: 4.006382E+00 | loss scale: 32768.0 | grad norm: 13446.578 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1777/ 292968 | consumed samples: 3639296 | consumed tokens: 396886016 | elapsed time per iteration (ms): 103760.4 | learning rate: 9.705E-05 | global batch size: 2048 | lm loss: 4.015323E+00 | loss scale: 32768.0 | grad norm: 15053.467 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1778/ 292968 | consumed samples: 3641344 | consumed tokens: 397213696 | elapsed time per iteration (ms): 102733.4 | learning rate: 9.710E-05 | global batch size: 2048 | lm loss: 4.001670E+00 | loss scale: 32768.0 | grad norm: 23635.791 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1779/ 292968 | consumed samples: 3643392 | consumed tokens: 397541376 | elapsed time per iteration (ms): 104049.0 | learning rate: 9.716E-05 | global batch size: 2048 | lm loss: 4.023041E+00 | loss scale: 32768.0 | grad norm: 32334.136 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1780/ 292968 | consumed samples: 3645440 | consumed tokens: 397869056 | elapsed time per iteration (ms): 104878.5 | learning rate: 9.721E-05 | global batch size: 2048 | lm loss: 4.011945E+00 | loss scale: 32768.0 | grad norm: 24107.921 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1781/ 292968 | consumed samples: 3647488 | consumed tokens: 398196736 | elapsed time per iteration (ms): 104330.0 | learning rate: 9.727E-05 | global batch size: 2048 | lm loss: 4.006859E+00 | loss scale: 32768.0 | grad norm: 21357.836 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1782/ 292968 | consumed samples: 3649536 | consumed tokens: 398524416 | elapsed time per iteration (ms): 104742.8 | learning rate: 9.732E-05 | global batch size: 2048 | lm loss: 4.002854E+00 | loss scale: 32768.0 | grad norm: 22121.852 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1783/ 292968 | consumed samples: 3651584 | consumed tokens: 398852096 | elapsed time per iteration (ms): 103715.4 | learning rate: 9.738E-05 | global batch size: 2048 | lm loss: 4.009685E+00 | loss scale: 32768.0 | grad norm: 16257.049 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1784/ 292968 | consumed samples: 3653632 | consumed tokens: 399179776 | elapsed time per iteration (ms): 104839.0 | learning rate: 9.743E-05 | global batch size: 2048 | lm loss: 4.036745E+00 | loss scale: 32768.0 | grad norm: 17643.676 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1785/ 292968 | consumed samples: 3655680 | consumed tokens: 399507456 | elapsed time per iteration (ms): 103374.0 | learning rate: 9.748E-05 | global batch size: 2048 | lm loss: 4.018547E+00 | loss scale: 32768.0 | grad norm: 14944.843 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1786/ 292968 | consumed samples: 3657728 | consumed tokens: 399835136 | elapsed time per iteration (ms): 104142.6 | learning rate: 9.754E-05 | global batch size: 2048 | lm loss: 4.013638E+00 | loss scale: 32768.0 | grad norm: 13711.308 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1787/ 292968 | consumed samples: 3659776 | consumed tokens: 400162816 | elapsed time per iteration (ms): 101592.2 | learning rate: 9.759E-05 | global batch size: 2048 | lm loss: 4.009739E+00 | loss scale: 32768.0 | grad norm: 15181.037 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1788/ 292968 | consumed samples: 3661824 | consumed tokens: 400490496 | elapsed time per iteration (ms): 103650.9 | learning rate: 9.765E-05 | global batch size: 2048 | lm loss: 4.028801E+00 | loss scale: 32768.0 | grad norm: 14446.078 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1789/ 292968 | consumed samples: 3663872 | consumed tokens: 400818176 | elapsed time per iteration (ms): 104383.3 | learning rate: 9.770E-05 | global batch size: 2048 | lm loss: 4.005740E+00 | loss scale: 32768.0 | grad norm: 15381.889 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1790/ 292968 | consumed samples: 3665920 | consumed tokens: 401145856 | elapsed time per iteration (ms): 103379.7 | learning rate: 9.776E-05 | global batch size: 2048 | lm loss: 3.985616E+00 | loss scale: 32768.0 | grad norm: 20385.702 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1791/ 292968 | consumed samples: 3667968 | consumed tokens: 401473536 | elapsed time per iteration (ms): 103039.5 | learning rate: 9.781E-05 | global batch size: 2048 | lm loss: 4.021649E+00 | loss scale: 32768.0 | grad norm: 23358.806 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1792/ 292968 | consumed samples: 3670016 | consumed tokens: 401801216 | elapsed time per iteration (ms): 104568.2 | learning rate: 9.787E-05 | global batch size: 2048 | lm loss: 4.014658E+00 | loss scale: 32768.0 | grad norm: 22290.455 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1793/ 292968 | consumed samples: 3672064 | consumed tokens: 402128896 | elapsed time per iteration (ms): 105118.1 | learning rate: 9.792E-05 | global batch size: 2048 | lm loss: 4.010005E+00 | loss scale: 32768.0 | grad norm: 22001.765 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1794/ 292968 | consumed samples: 3674112 | consumed tokens: 402456576 | elapsed time per iteration (ms): 103836.1 | learning rate: 9.798E-05 | global batch size: 2048 | lm loss: 4.047166E+00 | loss scale: 32768.0 | grad norm: 16288.266 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1795/ 292968 | consumed samples: 3676160 | consumed tokens: 402784256 | elapsed time per iteration (ms): 103950.0 | learning rate: 9.803E-05 | global batch size: 2048 | lm loss: 4.014968E+00 | loss scale: 32768.0 | grad norm: 13696.969 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1796/ 292968 | consumed samples: 3678208 | consumed tokens: 403111936 | elapsed time per iteration (ms): 102830.3 | learning rate: 9.809E-05 | global batch size: 2048 | lm loss: 4.014853E+00 | loss scale: 32768.0 | grad norm: 17161.664 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1797/ 292968 | consumed samples: 3680256 | consumed tokens: 403439616 | elapsed time per iteration (ms): 103787.6 | learning rate: 9.814E-05 | global batch size: 2048 | lm loss: 4.021245E+00 | loss scale: 32768.0 | grad norm: 14841.216 | num zeros: 0.0 |
curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1798/ 292968 | consumed samples: 3682304 | consumed tokens: 403767296 | elapsed time per iteration (ms): 104101.3 | learning rate: 9.819E-05 | global batch size: 2048 | lm loss: 4.009190E+00 | loss scale: 32768.0 | grad norm: 13968.024 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1799/ 292968 | consumed samples: 3684352 | consumed tokens: 404094976 | elapsed time per iteration (ms): 101799.6 | learning rate: 9.825E-05 | global batch size: 2048 | lm loss: 4.003065E+00 | loss scale: 32768.0 | grad norm: 14235.084 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1800/ 292968 | consumed samples: 3686400 | consumed tokens: 404422656 | elapsed time per iteration (ms): 106302.7 | learning rate: 9.830E-05 | global batch size: 2048 | lm loss: 4.014077E+00 | loss scale: 32768.0 | grad norm: 14045.171 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 1800 | lm loss value: 4.000999E+00 | lm loss PPL: 5.465275E+01 |
-------------------------------------------------------------------------------------------------
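The validation figures above are internally consistent: the logged "lm loss PPL" is the exponential of the logged "lm loss value". A minimal check of that relationship in plain Python (a sketch; the variable names are ours, not identifiers from the training code):

    import math

    lm_loss = 4.000999       # "lm loss value" at iteration 1800, from the block above
    ppl = math.exp(lm_loss)  # perplexity as exp(mean LM loss)
    print(f"{ppl:.4E}")      # 5.4653E+01 -- matches the logged 5.465275E+01
                             # (any last-digit drift comes from the loss being printed rounded)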
-saving checkpoint at iteration 1800 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-27 01:57:14,806] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/mp_rank_01_model_states.pt
-[2021-10-27 01:57:15,049] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/mp_rank_00_model_states.pt
-[2021-10-27 01:57:27,834] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_76_optim_states.pt
-[2021-10-27 01:57:27,862] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_113_optim_states.pt
-[2021-10-27 01:57:27,962] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_122_optim_states.pt
-[2021-10-27 01:57:27,992] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_84_optim_states.pt
-[2021-10-27 01:57:28,016] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_85_optim_states.pt
-[2021-10-27 01:57:28,054] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_27_optim_states.pt
-[2021-10-27 01:57:28,095] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_73_optim_states.pt
-[2021-10-27 01:57:28,099] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_107_optim_states.pt
-[2021-10-27 01:57:28,111] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_09_optim_states.pt
-[2021-10-27 01:57:28,128] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_82_optim_states.pt
-[2021-10-27 01:57:28,155] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-27 01:57:28,167] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_11_optim_states.pt
-[2021-10-27 01:57:28,181] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_14_optim_states.pt
-[2021-10-27 01:57:28,181] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_88_optim_states.pt
-[2021-10-27 01:57:28,198] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_79_optim_states.pt
-[2021-10-27 01:57:28,216] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_06_optim_states.pt
-[2021-10-27 01:57:28,232] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_111_optim_states.pt
-[2021-10-27 01:57:28,244] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_115_optim_states.pt
-[2021-10-27 01:57:28,246] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_15_optim_states.pt
-[2021-10-27 01:57:28,247] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_123_optim_states.pt
-[2021-10-27 01:57:28,254] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_23_optim_states.pt
-[2021-10-27 01:57:28,271] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_07_optim_states.pt
-[2021-10-27 01:57:28,278] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_118_optim_states.pt
-[2021-10-27 01:57:28,319] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-27 01:57:28,335] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-27 01:57:28,340] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-27 01:57:28,364] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-27 01:57:28,366] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-27 01:57:28,418] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-27 01:57:28,583] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-27 01:57:28,610] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-27 01:57:28,687] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_24_optim_states.pt -[2021-10-27 01:57:28,746] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-27 01:57:28,971] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_120_optim_states.pt -[2021-10-27 01:57:28,986] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-27 01:57:29,011] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-27 01:57:29,021] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_87_optim_states.pt -[2021-10-27 01:57:29,048] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-27 01:57:29,053] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_106_optim_states.pt -[2021-10-27 01:57:29,068] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-27 01:57:29,082] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-27 01:57:29,091] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-27 01:57:29,095] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-27 01:57:29,115] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-27 01:57:29,121] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-27 01:57:29,125] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-27 01:57:29,127] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-27 01:57:29,131] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_10_optim_states.pt -[2021-10-27 01:57:29,140] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_97_optim_states.pt -[2021-10-27 01:57:29,144] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-27 01:57:29,145] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-27 01:57:29,148] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-27 01:57:29,166] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-27 01:57:29,220] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-27 01:57:29,232] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-27 01:57:29,253] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-27 01:57:29,262] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-27 01:57:29,303] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-27 01:57:29,313] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-27 01:57:29,339] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-27 01:57:29,355] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-27 01:57:29,371] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-27 01:57:29,397] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-27 01:57:29,431] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_72_optim_states.pt -[2021-10-27 01:57:29,434] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-27 01:57:29,442] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-27 01:57:29,469] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-27 01:57:29,489] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-27 01:57:29,500] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_05_optim_states.pt -[2021-10-27 01:57:29,513] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-27 01:57:29,519] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-27 01:57:29,523] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_08_optim_states.pt -[2021-10-27 01:57:29,541] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-27 01:57:29,543] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-27 01:57:29,554] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-27 01:57:29,567] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-27 01:57:29,584] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-27 01:57:29,599] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-27 01:57:29,632] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_28_optim_states.pt -[2021-10-27 01:57:29,643] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-27 01:57:29,646] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-27 01:57:29,665] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-27 01:57:29,700] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-27 01:57:29,718] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-27 01:57:29,749] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-27 01:57:29,783] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-27 01:57:29,789] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-27 01:57:29,829] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-27 01:57:29,845] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-27 01:57:29,877] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-27 01:57:29,880] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-27 01:57:29,883] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-27 01:57:29,915] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-27 01:57:29,924] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-27 01:57:29,945] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-27 01:57:29,954] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-27 01:57:29,995] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-27 01:57:30,001] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-27 01:57:30,003] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-27 01:57:30,046] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-27 01:57:30,069] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-27 01:57:30,078] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-27 01:57:30,136] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-27 01:57:30,152] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-27 01:57:30,243] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-27 01:57:30,276] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-27 01:57:30,283] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-27 01:57:30,328] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-27 01:57:30,367] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-27 01:57:30,386] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-27 01:57:30,386] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-27 01:57:30,511] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-27 01:57:30,736] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-27 01:57:30,747] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-27 01:57:30,856] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-27 01:57:31,005] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-27 01:57:32,027] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-27 01:57:32,083] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-27 01:57:32,240] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_126_optim_states.pt -[2021-10-27 01:57:32,296] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_00_optim_states.pt
-[2021-10-27 01:57:32,513] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_17_optim_states.pt
-[2021-10-27 01:57:33,936] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_21_optim_states.pt
-[2021-10-27 01:57:34,063] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_20_optim_states.pt
-[2021-10-27 01:57:34,695] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_18_optim_states.pt
-[2021-10-27 01:57:35,818] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_19_optim_states.pt
-[2021-10-27 01:57:36,371] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_16_optim_states.pt
-[2021-10-27 01:57:36,764] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_31_optim_states.pt
-[2021-10-27 01:57:37,734] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step1800/zero_pp_rank_0_mp_rank_30_optim_states.pt
- successfully saved checkpoint at iteration 1800 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 25742.57
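The save itself took about 25.7 s (save-checkpoint: 25742.57 ms), which together with the validation pass explains why iteration 1801 below reports 319509.3 ms instead of the usual ~104 s. The counters in the iteration lines also follow simple bookkeeping: consumed samples = iteration x global batch size, and each step adds global batch size x curriculum seqlen tokens (the cumulative token count is not samples x 160, since the curriculum seqlen was smaller earlier in the run). A sketch of that arithmetic (illustrative names, not identifiers from the training code):

    GLOBAL_BATCH_SIZE = 2048  # "global batch size" field
    CURRICULUM_SEQLEN = 160   # "curriculum seqlen" field at this point in training

    iteration = 1801
    consumed_samples = iteration * GLOBAL_BATCH_SIZE         # 3688448, as logged below
    tokens_per_step = GLOBAL_BATCH_SIZE * CURRICULUM_SEQLEN  # 327680 new tokens per step
    consumed_tokens = 404422656 + tokens_per_step            # 404750336: iteration 1800's
    print(consumed_samples, consumed_tokens)                 # total plus one step, as logged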
- iteration 1801/ 292968 | consumed samples: 3688448 | consumed tokens: 404750336 | elapsed time per iteration (ms): 319509.3 | learning rate: 9.836E-05 | global batch size: 2048 | lm loss: 4.046515E+00 | loss scale: 32768.0 | grad norm: 14063.731 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1802/ 292968 | consumed samples: 3690496 | consumed tokens: 405078016 | elapsed time per iteration (ms): 103858.0 | learning rate: 9.841E-05 | global batch size: 2048 | lm loss: 3.965111E+00 | loss scale: 32768.0 | grad norm: 18659.714 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1803/ 292968 | consumed samples: 3692544 | consumed tokens: 405405696 | elapsed time per iteration (ms): 102894.6 | learning rate: 9.847E-05 | global batch size: 2048 | lm loss: 4.019529E+00 | loss scale: 32768.0 | grad norm: 19065.366 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1804/ 292968 | consumed samples: 3694592 | consumed tokens: 405733376 | elapsed time per iteration (ms): 105693.0 | learning rate: 9.852E-05 | global batch size: 2048 | lm loss: 4.016005E+00 | loss scale: 32768.0 | grad norm: 17837.264 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1805/ 292968 | consumed samples: 3696640 | consumed tokens: 406061056 | elapsed time per iteration (ms): 109231.4 | learning rate: 9.858E-05 | global batch size: 2048 | lm loss: 4.019568E+00 | loss scale: 32768.0 | grad norm: 15688.023 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1806/ 292968 | consumed samples: 3698688 | consumed tokens: 406388736 | elapsed time per iteration (ms): 104768.2 | learning rate: 9.863E-05 | global batch size: 2048 | lm loss: 4.016542E+00 | loss scale: 32768.0 | grad norm: 22634.235 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1807/ 292968 | consumed samples: 3700736 | consumed tokens: 406716416 | elapsed time per iteration (ms): 105375.6 | learning rate: 9.869E-05 | global batch size: 2048 | lm loss: 4.018755E+00 | loss scale: 32768.0 | grad norm: 27630.477 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1808/ 292968 | consumed samples: 3702784 | consumed tokens: 407044096 | elapsed time per iteration (ms): 107148.8 | learning rate: 9.874E-05 | global batch size: 2048 | lm loss: 4.014750E+00 | loss scale: 32768.0 | grad norm: 29592.805 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1809/ 292968 | consumed samples: 3704832 | consumed tokens: 407371776 | elapsed time per iteration (ms): 103943.7 | learning rate: 9.880E-05 | global batch size: 2048 | lm loss: 4.004914E+00 | loss scale: 32768.0 | grad norm: 23254.375 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1810/ 292968 | consumed samples: 3706880 | consumed tokens: 407699456 | elapsed time per iteration (ms): 107724.5 | learning rate: 9.885E-05 | global batch size: 2048 | lm loss: 4.020401E+00 | loss scale: 32768.0 | grad norm: 18910.132 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1811/ 292968 | consumed samples: 3708928 | consumed tokens: 408027136 | elapsed time per iteration (ms): 103335.0 | learning rate: 9.890E-05 | global batch size: 2048 | lm loss: 4.008213E+00 | loss scale: 32768.0 | grad norm: 23252.347 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1812/ 292968 | consumed samples: 3710976 | consumed tokens: 408354816 | elapsed time per iteration (ms): 103840.4 | learning rate: 9.896E-05 | global batch size: 2048 | lm loss: 4.003132E+00 | loss scale: 32768.0 | grad norm: 16887.177 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1813/ 292968 | consumed samples: 3713024 | consumed tokens: 408682496 | elapsed time per iteration (ms): 106040.8 | learning rate: 9.901E-05 | global batch size: 2048 | lm loss: 3.998968E+00 | loss scale: 32768.0 | grad norm: 16284.716 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 1814/ 292968 | consumed samples: 3715072 | consumed tokens: 409010176 | elapsed time per iteration (ms): 104600.7 | learning rate: 9.907E-05 | global batch size: 2048 | lm loss: 4.016735E+00 | loss scale: 32768.0 | grad norm: 16993.964 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number
of nan iterations: 0 | -time (ms) - iteration 1815/ 292968 | consumed samples: 3717120 | consumed tokens: 409337856 | elapsed time per iteration (ms): 103347.3 | learning rate: 9.912E-05 | global batch size: 2048 | lm loss: 3.967012E+00 | loss scale: 32768.0 | grad norm: 15456.159 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1816/ 292968 | consumed samples: 3719168 | consumed tokens: 409665536 | elapsed time per iteration (ms): 104765.1 | learning rate: 9.918E-05 | global batch size: 2048 | lm loss: 4.006498E+00 | loss scale: 32768.0 | grad norm: 17697.007 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1817/ 292968 | consumed samples: 3721216 | consumed tokens: 409993216 | elapsed time per iteration (ms): 105294.4 | learning rate: 9.923E-05 | global batch size: 2048 | lm loss: 4.001330E+00 | loss scale: 32768.0 | grad norm: 18741.692 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1818/ 292968 | consumed samples: 3723264 | consumed tokens: 410320896 | elapsed time per iteration (ms): 102095.0 | learning rate: 9.929E-05 | global batch size: 2048 | lm loss: 4.021041E+00 | loss scale: 32768.0 | grad norm: 16121.321 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1819/ 292968 | consumed samples: 3725312 | consumed tokens: 410648576 | elapsed time per iteration (ms): 104457.2 | learning rate: 9.934E-05 | global batch size: 2048 | lm loss: 4.003345E+00 | loss scale: 32768.0 | grad norm: 14748.749 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1820/ 292968 | consumed samples: 3727360 | consumed tokens: 410976256 | elapsed time per iteration (ms): 103518.7 | learning rate: 9.940E-05 | global batch size: 2048 | lm loss: 3.993558E+00 | loss scale: 32768.0 | grad norm: 12476.942 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1821/ 292968 | consumed samples: 3729408 | consumed tokens: 411303936 | elapsed time per iteration (ms): 102718.4 | learning rate: 9.945E-05 | global batch size: 2048 | lm loss: 3.986332E+00 | loss scale: 32768.0 | grad norm: 15585.231 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1822/ 292968 | consumed samples: 3731456 | consumed tokens: 411631616 | elapsed time per iteration (ms): 103952.3 | learning rate: 9.951E-05 | global batch size: 2048 | lm loss: 3.997984E+00 | loss scale: 32768.0 | grad norm: 18146.375 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1823/ 292968 | consumed samples: 3733504 | consumed tokens: 411959296 | elapsed time per iteration (ms): 104675.7 | learning rate: 9.956E-05 | global batch size: 2048 | lm loss: 4.027717E+00 | loss scale: 32768.0 | grad norm: 21364.624 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1824/ 292968 | consumed samples: 3735552 | consumed tokens: 412286976 | elapsed time per iteration (ms): 106258.7 | learning rate: 9.961E-05 | global batch size: 2048 | lm loss: 4.013083E+00 | loss scale: 
32768.0 | grad norm: 25364.957 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1825/ 292968 | consumed samples: 3737600 | consumed tokens: 412614656 | elapsed time per iteration (ms): 104068.6 | learning rate: 9.967E-05 | global batch size: 2048 | lm loss: 3.992085E+00 | loss scale: 32768.0 | grad norm: 23663.513 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1826/ 292968 | consumed samples: 3739648 | consumed tokens: 412942336 | elapsed time per iteration (ms): 102924.6 | learning rate: 9.972E-05 | global batch size: 2048 | lm loss: 4.007163E+00 | loss scale: 32768.0 | grad norm: 24129.785 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1827/ 292968 | consumed samples: 3741696 | consumed tokens: 413270016 | elapsed time per iteration (ms): 103482.4 | learning rate: 9.978E-05 | global batch size: 2048 | lm loss: 3.991204E+00 | loss scale: 32768.0 | grad norm: 23367.428 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1828/ 292968 | consumed samples: 3743744 | consumed tokens: 413597696 | elapsed time per iteration (ms): 105144.3 | learning rate: 9.983E-05 | global batch size: 2048 | lm loss: 4.011548E+00 | loss scale: 32768.0 | grad norm: 19931.420 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1829/ 292968 | consumed samples: 3745792 | consumed tokens: 413925376 | elapsed time per iteration (ms): 102272.3 | learning rate: 9.989E-05 | global batch size: 2048 | lm loss: 4.008599E+00 | loss scale: 32768.0 | grad norm: 23015.396 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1830/ 292968 | consumed samples: 3747840 | consumed tokens: 414253056 | elapsed time per iteration (ms): 103321.4 | learning rate: 9.994E-05 | global batch size: 2048 | lm loss: 3.977224E+00 | loss scale: 32768.0 | grad norm: 18987.146 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1831/ 292968 | consumed samples: 3749888 | consumed tokens: 414580736 | elapsed time per iteration (ms): 104215.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.007137E+00 | loss scale: 32768.0 | grad norm: 21387.069 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1832/ 292968 | consumed samples: 3751936 | consumed tokens: 414908416 | elapsed time per iteration (ms): 104504.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.007439E+00 | loss scale: 32768.0 | grad norm: 26369.559 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1833/ 292968 | consumed samples: 3753984 | consumed tokens: 415236096 | elapsed time per iteration (ms): 102758.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.989631E+00 | loss scale: 32768.0 | grad norm: 21028.505 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1834/ 292968 | consumed samples: 3756032 | consumed tokens: 415563776 | elapsed time per 
iteration (ms): 104795.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.969561E+00 | loss scale: 32768.0 | grad norm: 15009.612 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1835/ 292968 | consumed samples: 3758080 | consumed tokens: 415891456 | elapsed time per iteration (ms): 105188.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.003643E+00 | loss scale: 32768.0 | grad norm: 16567.730 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1836/ 292968 | consumed samples: 3760128 | consumed tokens: 416219136 | elapsed time per iteration (ms): 103721.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.037795E+00 | loss scale: 32768.0 | grad norm: 19094.075 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1837/ 292968 | consumed samples: 3762176 | consumed tokens: 416546816 | elapsed time per iteration (ms): 102213.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.991792E+00 | loss scale: 32768.0 | grad norm: 19502.392 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1838/ 292968 | consumed samples: 3764224 | consumed tokens: 416874496 | elapsed time per iteration (ms): 104563.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.013852E+00 | loss scale: 32768.0 | grad norm: 20086.677 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1839/ 292968 | consumed samples: 3766272 | consumed tokens: 417202176 | elapsed time per iteration (ms): 103569.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.998660E+00 | loss scale: 32768.0 | grad norm: 15059.153 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1840/ 292968 | consumed samples: 3768320 | consumed tokens: 417529856 | elapsed time per iteration (ms): 103521.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.001042E+00 | loss scale: 32768.0 | grad norm: 14211.409 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1841/ 292968 | consumed samples: 3770368 | consumed tokens: 417857536 | elapsed time per iteration (ms): 104170.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.012125E+00 | loss scale: 32768.0 | grad norm: 18389.771 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1842/ 292968 | consumed samples: 3772416 | consumed tokens: 418185216 | elapsed time per iteration (ms): 105869.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.986781E+00 | loss scale: 32768.0 | grad norm: 19668.908 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1843/ 292968 | consumed samples: 3774464 | consumed tokens: 418512896 | elapsed time per iteration (ms): 102886.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.982328E+00 | loss scale: 32768.0 | grad norm: 19136.149 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 
0 | -time (ms) - iteration 1844/ 292968 | consumed samples: 3776512 | consumed tokens: 418840576 | elapsed time per iteration (ms): 102642.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.999387E+00 | loss scale: 32768.0 | grad norm: 20221.566 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1845/ 292968 | consumed samples: 3778560 | consumed tokens: 419168256 | elapsed time per iteration (ms): 103349.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.002107E+00 | loss scale: 32768.0 | grad norm: 22002.635 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1846/ 292968 | consumed samples: 3780608 | consumed tokens: 419495936 | elapsed time per iteration (ms): 104694.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.007929E+00 | loss scale: 32768.0 | grad norm: 23219.445 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1847/ 292968 | consumed samples: 3782656 | consumed tokens: 419823616 | elapsed time per iteration (ms): 102716.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.981622E+00 | loss scale: 32768.0 | grad norm: 18122.042 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1848/ 292968 | consumed samples: 3784704 | consumed tokens: 420151296 | elapsed time per iteration (ms): 103687.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.999981E+00 | loss scale: 32768.0 | grad norm: 15901.681 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1849/ 292968 | consumed samples: 3786752 | consumed tokens: 420478976 | elapsed time per iteration (ms): 103768.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.017741E+00 | loss scale: 32768.0 | grad norm: 15743.800 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1850/ 292968 | consumed samples: 3788800 | consumed tokens: 420806656 | elapsed time per iteration (ms): 103181.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.000669E+00 | loss scale: 32768.0 | grad norm: 14585.118 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1851/ 292968 | consumed samples: 3790848 | consumed tokens: 421134336 | elapsed time per iteration (ms): 103704.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.998010E+00 | loss scale: 32768.0 | grad norm: 19649.401 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1852/ 292968 | consumed samples: 3792896 | consumed tokens: 421462016 | elapsed time per iteration (ms): 102416.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.974093E+00 | loss scale: 32768.0 | grad norm: 21502.088 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 1853/ 292968 | consumed samples: 3794944 | consumed tokens: 421789696 | elapsed time per iteration (ms): 104810.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.989284E+00 | loss scale: 32768.0 | grad 
norm: 15910.019 | num zeros: 0.0 | curriculum seqlen: 160 | number of skipped iterations: 0 | number of nan iterations: 0 |

Constant fields for every iteration record below: learning rate 1.000E-04 | global batch size 2048 | num zeros 0.0 | skipped iterations 0 | nan iterations 0. Loss scale is 32768.0 through iteration 1999 and 65536.0 from iteration 2000. The remaining per-iteration fields:

iteration (of 292968) | consumed samples | consumed tokens | elapsed ms/iter | lm loss | grad norm | curriculum seqlen
1854 | 3796992 | 422117376 | 104900.9 | 4.003990E+00 | 14955.049 | 160
1855 | 3799040 | 422445056 | 105003.6 | 3.985674E+00 | 19275.223 | 160
1856 | 3801088 | 422772736 | 110393.3 | 4.030143E+00 | 22017.659 | 160
1857 | 3803136 | 423100416 | 105250.3 | 4.004313E+00 | 20803.880 | 160
1858 | 3805184 | 423428096 | 108057.5 | 3.996152E+00 | 15277.064 | 160
1859 | 3807232 | 423755776 | 106003.7 | 4.003296E+00 | 15697.932 | 160
1860 | 3809280 | 424083456 | 105314.8 | 4.003977E+00 | 14991.132 | 160
1861 | 3811328 | 424411136 | 104906.9 | 3.962152E+00 | 15488.594 | 160
1862 | 3813376 | 424738816 | 106770.9 | 3.957976E+00 | 19842.969 | 160
1863 | 3815424 | 425066496 | 103967.0 | 3.970975E+00 | 26325.707 | 160
1864 | 3817472 | 425394176 | 104099.4 | 3.959454E+00 | 25346.771 | 160
1865 | 3819520 | 425721856 | 104626.4 | 3.981757E+00 | 14262.590 | 160
1866 | 3821568 | 426049536 | 105326.7 | 3.974360E+00 | 13505.152 | 160
1867 | 3823616 | 426377216 | 103743.3 | 3.968640E+00 | 18092.374 | 160
1868 | 3825664 | 426704896 | 103889.3 | 3.990609E+00 | 25563.167 | 160
1869 | 3827712 | 427032576 | 105379.3 | 4.001534E+00 | 26342.114 | 160
1870 | 3829760 | 427360256 | 104129.9 | 4.016356E+00 | 20695.859 | 160
1871 | 3831808 | 427687936 | 103554.9 | 3.980767E+00 | 18266.523 | 160
1872 | 3833856 | 428015616 | 103328.7 | 3.966352E+00 | 20523.363 | 160
1873 | 3835904 | 428343296 | 105100.0 | 3.975338E+00 | 15278.499 | 160
1874 | 3837952 | 428670976 | 104468.3 | 3.978165E+00 | 17249.432 | 160
1875 | 3840000 | 428998656 | 104659.3 | 4.002756E+00 | 16227.485 | 160
1876 | 3842048 | 429326336 | 106522.8 | 4.016373E+00 | 18078.560 | 160
1877 | 3844096 | 429654016 | 105534.6 | 3.978780E+00 | 17744.305 | 160
1878 | 3846144 | 429981696 | 108817.2 | 3.977035E+00 | 18957.105 | 160
1879 | 3848192 | 430309376 | 110899.8 | 3.984744E+00 | 17614.107 | 160
1880 | 3850240 | 430637056 | 106879.8 | 3.999998E+00 | 13945.817 | 160
1881 | 3852288 | 430964736 | 104266.0 | 3.997741E+00 | 16360.110 | 160
1882 | 3854336 | 431292416 | 109896.9 | 3.973211E+00 | 13602.619 | 160
1883 | 3856384 | 431620096 | 106835.9 | 3.979127E+00 | 13176.365 | 160
1884 | 3858432 | 431947776 | 103387.7 | 3.988900E+00 | 13397.455 | 160
1885 | 3860480 | 432275456 | 104310.8 | 3.954021E+00 | 12418.918 | 160
1886 | 3862528 | 432603136 | 103098.6 | 3.973892E+00 | 13109.810 | 160
1887 | 3864576 | 432930816 | 104190.7 | 3.996217E+00 | 15248.506 | 160
1888 | 3866624 | 433274880 | 106975.2 | 4.014733E+00 | 27401.333 | 168
1889 | 3868672 | 433618944 | 104275.3 | 4.211926E+00 | 47022.444 | 168
1890 | 3870720 | 433963008 | 104767.2 | 4.152137E+00 | 86972.643 | 168
1891 | 3872768 | 434307072 | 104340.1 | 4.148450E+00 | 43415.458 | 168
1892 | 3874816 | 434651136 | 105395.5 | 4.167953E+00 | 70642.549 | 168
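The sequence-length curriculum steps up from 160 to 168 at iteration 1888, and the per-step token increment changes accordingly (327,680 to 344,064 tokens); the loss and grad-norm spike over iterations 1888-1896 follows the longer sequences. A quick consistency check on the accounting, in plain Python using values taken from the rows above:

```python
GLOBAL_BATCH = 2048

# Tokens consumed per optimizer step = global batch size * curriculum seqlen.
assert 432_930_816 - 432_603_136 == GLOBAL_BATCH * 160  # iters 1886 -> 1887 (seqlen 160)
assert 433_274_880 - 432_930_816 == GLOBAL_BATCH * 168  # iters 1887 -> 1888 (seqlen 168)
```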
1893 | 3876864 | 434995200 | 104133.2 | 4.050327E+00 | 31692.073 | 168
1894 | 3878912 | 435339264 | 105356.7 | 4.063833E+00 | 34551.797 | 168
1895 | 3880960 | 435683328 | 106318.8 | 4.105365E+00 | 29266.025 | 168
1896 | 3883008 | 436027392 | 106038.7 | 4.083402E+00 | 48175.604 | 168
1897 | 3885056 | 436371456 | 106005.5 | 4.066614E+00 | 24647.768 | 168
1898 | 3887104 | 436715520 | 104472.1 | 4.054377E+00 | 26271.428 | 168
1899 | 3889152 | 437059584 | 107533.8 | 4.027953E+00 | 24496.772 | 168
1900 | 3891200 | 437403648 | 105148.3 | 4.060702E+00 | 24856.430 | 168
1901 | 3893248 | 437747712 | 105232.4 | 3.998415E+00 | 17253.064 | 168
1902 | 3895296 | 438091776 | 105374.5 | 4.003147E+00 | 16352.381 | 168
1903 | 3897344 | 438435840 | 104894.4 | 4.035781E+00 | 18975.280 | 168
1904 | 3899392 | 438779904 | 104730.9 | 3.971842E+00 | 25320.959 | 168
1905 | 3901440 | 439123968 | 105836.3 | 3.982657E+00 | 17809.913 | 168
1906 | 3903488 | 439468032 | 106493.5 | 3.971260E+00 | 11183.800 | 168
1907 | 3905536 | 439812096 | 104920.7 | 3.959066E+00 | 14905.932 | 168
1908 | 3907584 | 440156160 | 106431.0 | 4.016831E+00 | 14512.253 | 168
1909 | 3909632 | 440500224 | 106285.0 | 3.968448E+00 | 16159.687 | 168
1910 | 3911680 | 440844288 | 105398.1 | 3.945616E+00 | 16250.641 | 168
1911 | 3913728 | 441188352 | 107226.6 | 3.961555E+00 | 16826.998 | 168
1912 | 3915776 | 441532416 | 104944.3 | 3.941115E+00 | 16824.593 | 168
1913 | 3917824 | 441876480 | 105594.5 | 3.965366E+00 | 16140.226 | 168
1914 | 3919872 | 442220544 | 105552.1 | 3.967574E+00 | 12898.281 | 168
1915 | 3921920 | 442564608 | 104963.8 | 3.956920E+00 | 14618.533 | 168
1916 | 3923968 | 442908672 | 105727.0 | 3.942524E+00 | 16636.229 | 168
1917 | 3926016 | 443252736 | 105059.3 | 3.983540E+00 | 12160.386 | 168
1918 | 3928064 | 443596800 | 106772.5 | 3.946107E+00 | 14448.935 | 168
1919 | 3930112 | 443940864 | 106157.7 | 3.936795E+00 | 17639.457 | 168
1920 | 3932160 | 444284928 | 103958.2 | 3.945120E+00 | 20370.927 | 168
1921 | 3934208 | 444628992 | 105688.7 | 3.944297E+00 | 19817.179 | 168
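At the pace shown here (roughly 105 s per 2048-sample step), the 292,968-iteration target amounts to about a year of wall-clock compute. A back-of-envelope estimate, with the caveat that it naively extrapolates this excerpt's step time and ignores the curriculum schedule pushing the sequence length (and hence step time) upward:

```python
TOTAL_ITERS = 292_968
CURRENT_ITER = 1_921     # last row above
SEC_PER_ITER = 105.5     # typical elapsed time per iteration in this excerpt

remaining_days = (TOTAL_ITERS - CURRENT_ITER) * SEC_PER_ITER / 86_400
print(f"~{remaining_days:.0f} days remaining")  # -> ~355 days
```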
1922 | 3936256 | 444973056 | 104602.0 | 3.959332E+00 | 14784.450 | 168
1923 | 3938304 | 445317120 | 104358.4 | 3.948244E+00 | 13779.814 | 168
1924 | 3940352 | 445661184 | 109721.9 | 3.958963E+00 | 16254.486 | 168
1925 | 3942400 | 446005248 | 106171.2 | 3.943365E+00 | 15950.526 | 168
1926 | 3944448 | 446349312 | 105304.8 | 3.949961E+00 | 16547.359 | 168
1927 | 3946496 | 446693376 | 105011.5 | 3.946321E+00 | 16395.710 | 168
1928 | 3948544 | 447037440 | 105566.6 | 3.956883E+00 | 13329.427 | 168
1929 | 3950592 | 447381504 | 105651.0 | 3.938112E+00 | 13582.341 | 168
1930 | 3952640 | 447725568 | 115146.6 | 3.959787E+00 | 15413.704 | 168
1931 | 3954688 | 448069632 | 106408.5 | 3.959998E+00 | 16908.405 | 168
1932 | 3956736 | 448413696 | 106034.9 | 3.968036E+00 | 16129.539 | 168
1933 | 3958784 | 448757760 | 104967.0 | 3.941890E+00 | 14085.742 | 168
1934 | 3960832 | 449101824 | 107633.9 | 3.959370E+00 | 13330.396 | 168
1935 | 3962880 | 449445888 | 112586.8 | 3.951007E+00 | 20514.051 | 168
1936 | 3964928 | 449789952 | 111987.8 | 3.946175E+00 | 27480.587 | 168
1937 | 3966976 | 450134016 | 105112.0 | 3.946023E+00 | 24727.537 | 168
1938 | 3969024 | 450478080 | 104963.0 | 3.925240E+00 | 16981.206 | 168
1939 | 3971072 | 450822144 | 105575.4 | 3.959154E+00 | 18318.796 | 168
1940 | 3973120 | 451166208 | 104649.8 | 3.939385E+00 | 20276.304 | 168
1941 | 3975168 | 451510272 | 105884.0 | 3.935658E+00 | 20253.194 | 168
1942 | 3977216 | 451854336 | 111170.8 | 3.905532E+00 | 19102.238 | 168
1943 | 3979264 | 452198400 | 108227.6 | 3.937327E+00 | 12826.017 | 168
1944 | 3981312 | 452542464 | 105372.2 | 3.935393E+00 | 13267.474 | 168
1945 | 3983360 | 452886528 | 105463.6 | 3.920881E+00 | 14313.848 | 168
1946 | 3985408 | 453230592 | 106434.3 | 3.969090E+00 | 15846.360 | 168
1947 | 3987456 | 453574656 | 111036.3 | 3.967663E+00 | 18892.731 | 168
1948 | 3989504 | 453918720 | 105010.0 | 3.961262E+00 | 18065.501 | 168
1949 | 3991552 | 454262784 | 106625.5 | 3.953467E+00 | 17882.776 | 168
1950 | 3993600 | 454606848 | 106623.6 | 3.935070E+00 | 19027.421 | 168

validation loss at iteration 1950 | lm loss value: 3.928184E+00 | lm loss PPL: 5.081463E+01

1951 | 3995648 | 454950912 | 314425.1 | 3.907592E+00 | 16309.690 | 168
1952 | 3997696 | 455294976 | 115506.7 | 3.938197E+00 | 17344.580 | 168
1953 | 3999744 | 455639040 | 108778.4 | 3.943713E+00 | 17118.066 | 168
1954 | 4001792 | 455983104 | 105514.4 | 3.919490E+00 | 13833.363 | 168
1955 | 4003840 | 456327168 | 105888.1 | 3.950994E+00 | 13127.731 | 168
1956 | 4005888 | 456671232 | 105567.4 | 3.935221E+00 | 13543.300 | 168
1957 | 4007936 | 457015296 | 110793.7 | 3.923950E+00 | 16199.652 | 168
1958 | 4009984 | 457359360 | 107387.4 | 3.929323E+00 | 14293.414 | 168
1959 | 4012032 | 457703424 | 106499.5 | 3.949380E+00 | 18907.741 | 168
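The validation pass at iteration 1950 (its cost shows up as the 314 s elapsed time on the following row) reports both the LM loss and its perplexity; the latter is simply the exponential of the former, since the loss is a natural-log cross-entropy:

```python
import math

val_loss = 3.928184                  # validation lm loss at iteration 1950
print(f"{math.exp(val_loss):.4f}")   # -> 50.8146, matching the logged PPL 5.081463E+01
```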
1960 | 4014080 | 458047488 | 106836.2 | 3.930052E+00 | 16436.737 | 168
1961 | 4016128 | 458391552 | 106409.3 | 3.945310E+00 | 15376.669 | 168
1962 | 4018176 | 458735616 | 106965.6 | 3.956292E+00 | 15115.154 | 168
1963 | 4020224 | 459079680 | 108388.5 | 3.930058E+00 | 14066.271 | 168
1964 | 4022272 | 459423744 | 105804.9 | 3.928281E+00 | 19340.022 | 168
1965 | 4024320 | 459767808 | 106873.1 | 3.921288E+00 | 20923.170 | 168
1966 | 4026368 | 460111872 | 103668.6 | 3.929842E+00 | 19834.126 | 168
1967 | 4028416 | 460455936 | 106664.6 | 3.931633E+00 | 19386.027 | 168
1968 | 4030464 | 460800000 | 110508.2 | 3.945953E+00 | 19908.571 | 168
1969 | 4032512 | 461144064 | 110069.9 | 3.896821E+00 | 15035.351 | 168
1970 | 4034560 | 461488128 | 107170.1 | 3.940769E+00 | 13950.627 | 168
1971 | 4036608 | 461832192 | 106511.3 | 3.931390E+00 | 19245.494 | 168
1972 | 4038656 | 462176256 | 104143.5 | 3.939216E+00 | 23053.813 | 168
1973 | 4040704 | 462520320 | 106138.7 | 3.959975E+00 | 22524.458 | 168
1974 | 4042752 | 462864384 | 105586.4 | 3.905755E+00 | 19440.251 | 168
1975 | 4044800 | 463208448 | 106158.7 | 3.915691E+00 | 17649.388 | 168
1976 | 4046848 | 463552512 | 106708.4 | 3.920288E+00 | 20503.069 | 168
1977 | 4048896 | 463896576 | 105936.2 | 3.945108E+00 | 16839.813 | 168
1978 | 4050944 | 464240640 | 105458.1 | 3.917942E+00 | 15257.276 | 168
1979 | 4052992 | 464584704 | 107165.3 | 3.927221E+00 | 15093.813 | 168
1980 | 4055040 | 464928768 | 113081.0 | 3.957678E+00 | 13839.536 | 168
1981 | 4057088 | 465272832 | 108714.8 | 3.917398E+00 | 14074.082 | 168
1982 | 4059136 | 465616896 | 107604.5 | 3.925085E+00 | 13534.880 | 168
1983 | 4061184 | 465960960 | 112383.1 | 3.944923E+00 | 13209.445 | 168
1984 | 4063232 | 466305024 | 112954.3 | 3.918631E+00 | 19787.184 | 168
1985 | 4065280 | 466649088 | 111797.0 | 3.935518E+00 | 17837.294 | 168
1986 | 4067328 | 466993152 | 110679.8 | 3.927701E+00 | 24145.327 | 168
1987 | 4069376 | 467337216 | 106586.4 | 3.924149E+00 | 19059.242 | 168
1988 | 4071424 | 467681280 | 104497.1 | 3.911625E+00 | 15092.949 | 168
1989 | 4073472 | 468025344 | 104962.9 | 3.930661E+00 | 19898.790 | 168
1990 | 4075520 | 468369408 | 104607.9 | 3.931398E+00 | 18910.425 | 168
1991 | 4077568 | 468713472 | 103902.9 | 3.927662E+00 | 16632.425 | 168
1992 | 4079616 | 469057536 | 106519.1 | 3.915715E+00 | 13302.984 | 168
1993 | 4081664 | 469401600 | 105643.5 | 3.921783E+00 | 16160.708 | 168
1994 | 4083712 | 469745664 | 104271.9 | 3.939743E+00 | 19586.680 | 168
1995 | 4085760 | 470089728 | 105935.4 | 3.918940E+00 | 18793.983 | 168
1996 | 4087808 | 470433792 | 105026.3 | 3.930414E+00 | 16737.588 | 168
1997 | 4089856 | 470777856 | 104382.1 | 3.952893E+00 | 13563.057 | 168
1998 | 4091904 | 471121920 | 106021.3 | 3.901303E+00 | 15104.265 | 168
1999 | 4093952 | 471465984 | 105576.9 | 3.945539E+00 | 22390.801 | 168

[2021-10-27 07:53:42,718] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[9.99992373584452e-05, 9.99992373584452e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
steps: 2000 loss: 3.9148 iter time (s): 0.053 samples/sec: 38744.481

(loss scale: 65536.0 from iteration 2000 onward)

2000 | 4096000 | 471810048 | 105723.3 | 3.914763E+00 | 19113.174 | 168
2001 | 4098048 | 472154112 | 105032.0 | 3.919772E+00 | 45665.550 | 168
2002 | 4100096 | 472498176 | 104883.3 | 3.920336E+00 | 80367.931 | 168
2003 | 4102144 | 472842240 | 106158.4 | 3.942242E+00 | 46148.047 | 168
2004 | 4104192 | 473186304 | 107745.8 | 3.898877E+00 | 36023.288 | 168
2005 | 4106240 | 473530368 | 104817.4 | 3.943701E+00 | 38876.683 | 168
2006 | 4108288 | 473874432 | 106505.5 | 3.931247E+00 | 33470.765 | 168
2007 | 4110336 | 474218496 | 106419.1 | 3.964560E+00 | 29687.656 | 168
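Two details around step 2000 are easy to misread. First, the DeepSpeed rate line appears to be based on its own step timer rather than wall clock: 0.053 s x 38744.481 samples/s is roughly 2053, about one 2048-sample batch, versus the ~105 s full-iteration time logged by Megatron. Second, the loss scale doubling from 32768.0 to 65536.0 exactly at iteration 2000, with zero skipped iterations, is the signature of dynamic loss scaling, which grows the scale after a window of consecutive overflow-free steps and shrinks it (skipping the update) on overflow. A minimal sketch of that mechanism; the window and growth factor below are illustrative assumptions, not values read from this log:

```python
class DynamicLossScaler:
    """Sketch of dynamic loss scaling (illustrative parameters)."""

    def __init__(self, init_scale=32768.0, growth_factor=2.0, window=2000):
        self.scale = init_scale
        self.growth_factor = growth_factor
        self.window = window      # overflow-free steps required before growing
        self.good_steps = 0

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self.scale /= self.growth_factor   # back off and skip this update
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.window:
                self.scale *= self.growth_factor   # e.g. 32768.0 -> 65536.0
                self.good_steps = 0
```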
2008 | 4112384 | 474562560 | 107192.6 | 3.907559E+00 | 39289.522 | 168
2009 | 4114432 | 474906624 | 104955.6 | 3.923905E+00 | 35524.350 | 168
2010 | 4116480 | 475250688 | 108872.8 | 3.916351E+00 | 24785.216 | 168
2011 | 4118528 | 475594752 | 104228.6 | 3.904333E+00 | 39590.400 | 168
2012 | 4120576 | 475938816 | 104741.6 | 3.935276E+00 | 37903.947 | 168
2013 | 4122624 | 476282880 | 106897.7 | 3.902785E+00 | 20403.308 | 168
2014 | 4124672 | 476626944 | 105250.1 | 3.913821E+00 | 23439.287 | 168
2015 | 4126720 | 476971008 | 107340.7 | 3.909118E+00 | 25155.083 | 168
2016 | 4128768 | 477315072 | 105778.5 | 3.910501E+00 | 23160.791 | 168
2017 | 4130816 | 477659136 | 104080.8 | 3.925369E+00 | 24046.482 | 168
2018 | 4132864 | 478003200 | 107274.3 | 3.888488E+00 | 25188.690 | 168
2019 | 4134912 | 478347264 | 105437.1 | 3.891493E+00 | 23830.177 | 168
2020 | 4136960 | 478691328 | 103301.0 | 3.916592E+00 | 32223.798 | 168
2021 | 4139008 | 479035392 | 108652.9 | 3.889750E+00 | 40872.900 | 168
2022 | 4141056 | 479379456 | 107504.2 | 3.892524E+00 | 28959.533 | 168
2023 | 4143104 | 479723520 | 105937.4 | 3.947634E+00 | 28395.060 | 168
2024 | 4145152 | 480067584 | 105944.1 | 3.909609E+00 | 25389.003 | 168
2025 | 4147200 | 480411648 | 105717.5 | 3.912829E+00 | 23156.778 | 168
2026 | 4149248 | 480755712 | 106132.9 | 3.903689E+00 | 32742.610 | 168
2027 | 4151296 | 481099776 | 107458.3 | 3.886243E+00 | 31171.176 | 168
2028 | 4153344 | 481443840 | 105627.0 | 3.913125E+00 | 25790.752 | 168

saving checkpoint at iteration 2028 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
[2021-10-27 08:43:15,456] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/mp_rank_00_model_states.pt
[2021-10-27 08:43:15,754] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/mp_rank_01_model_states.pt
[2021-10-27 08:43:28,422] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_77_optim_states.pt
[2021-10-27 08:43:28,476] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_05_optim_states.pt
[2021-10-27 08:43:28,524] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_116_optim_states.pt
[2021-10-27 08:43:28,581] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_76_optim_states.pt
[2021-10-27 08:43:28,628] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_112_optim_states.pt
[2021-10-27 08:43:28,687] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_84_optim_states.pt
[2021-10-27 08:43:28,694] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_123_optim_states.pt
[2021-10-27 08:43:28,711] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_115_optim_states.pt
[2021-10-27 08:43:28,745] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_08_optim_states.pt
[2021-10-27 08:43:28,757] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_111_optim_states.pt
[2021-10-27 08:43:28,771] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_22_optim_states.pt
[2021-10-27 08:43:28,789] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_87_optim_states.pt
[2021-10-27 08:43:28,791] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_73_optim_states.pt
[2021-10-27 08:43:28,843] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_07_optim_states.pt
[2021-10-27 08:43:28,844] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_10_optim_states.pt
[2021-10-27 08:43:28,880] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_15_optim_states.pt
[2021-10-27 08:43:28,896] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_107_optim_states.pt
[2021-10-27 08:43:28,900] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_98_optim_states.pt
[2021-10-27 08:43:28,906] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_106_optim_states.pt
[2021-10-27 08:43:28,908] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_120_optim_states.pt
[2021-10-27 08:43:28,926] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_72_optim_states.pt
[2021-10-27 08:43:28,945] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_23_optim_states.pt
[2021-10-27 08:43:28,959] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_118_optim_states.pt
[2021-10-27 08:43:28,988] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_14_optim_states.pt
[2021-10-27 08:43:29,003] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_97_optim_states.pt
[2021-10-27 08:43:29,009] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-27 08:43:29,070] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-27 08:43:29,078] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_27_optim_states.pt -[2021-10-27 08:43:29,118] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-27 08:43:29,264] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-27 08:43:29,345] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-27 08:43:29,390] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-27 08:43:29,475] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-27 08:43:29,546] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-27 08:43:29,610] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-27 08:43:29,641] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-27 08:43:29,655] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-27 08:43:29,679] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-27 08:43:29,686] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-27 08:43:29,714] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-27 08:43:29,718] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_11_optim_states.pt -[2021-10-27 08:43:29,719] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-27 08:43:29,731] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_79_optim_states.pt -[2021-10-27 08:43:29,745] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-27 08:43:29,802] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-27 08:43:29,802] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-27 08:43:29,845] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-27 08:43:29,872] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_06_optim_states.pt -[2021-10-27 08:43:29,881] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-27 08:43:29,885] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-27 08:43:29,900] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-27 08:43:29,929] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-27 08:43:29,931] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-27 08:43:29,934] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-27 08:43:29,944] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-27 08:43:29,944] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-27 08:43:29,969] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-27 08:43:29,972] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-27 08:43:30,003] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-27 08:43:30,012] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-27 08:43:30,018] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-27 08:43:30,025] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-27 08:43:30,027] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-27 08:43:30,031] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_24_optim_states.pt -[2021-10-27 08:43:30,032] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-27 08:43:30,036] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-27 08:43:30,039] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-27 08:43:30,087] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_82_optim_states.pt -[2021-10-27 08:43:30,108] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-27 08:43:30,110] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-27 08:43:30,138] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-27 08:43:30,141] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-27 08:43:30,158] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-27 08:43:30,170] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-27 08:43:30,171] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-27 08:43:30,176] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-27 08:43:30,196] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-27 08:43:30,210] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-27 08:43:30,237] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-27 08:43:30,242] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-27 08:43:30,269] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-27 08:43:30,282] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-27 08:43:30,286] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-27 08:43:30,301] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-27 08:43:30,339] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-27 08:43:30,372] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-27 08:43:30,401] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-27 08:43:30,428] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-27 08:43:30,475] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-27 08:43:30,480] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-27 08:43:30,510] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-27 08:43:30,536] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-27 08:43:30,561] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-27 08:43:30,580] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-27 08:43:30,583] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-27 08:43:30,608] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-27 08:43:30,676] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-27 08:43:30,714] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-27 08:43:30,716] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-27 08:43:30,739] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-27 08:43:30,745] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-27 08:43:30,781] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-27 08:43:30,815] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-27 08:43:30,825] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-27 08:43:30,868] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-27 08:43:30,869] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-27 08:43:30,875] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-27 08:43:30,963] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-27 08:43:30,964] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-27 08:43:30,979] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-27 08:43:30,991] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-27 08:43:31,100] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-27 08:43:31,240] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_126_optim_states.pt -[2021-10-27 08:43:31,250] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_00_optim_states.pt -[2021-10-27 08:43:31,381] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-27 08:43:31,416] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-27 08:43:32,638] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-27 08:43:32,703] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-27 08:43:32,817] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-27 08:43:33,045] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-27 08:43:33,181] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-27 08:43:34,307] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_21_optim_states.pt -[2021-10-27 08:43:34,646] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_20_optim_states.pt -[2021-10-27 08:43:36,665] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-27 08:43:37,955] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-27 08:43:37,963] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_28_optim_states.pt -[2021-10-27 08:43:38,123] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-27 08:43:38,316] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2028/zero_pp_rank_0_mp_rank_17_optim_states.pt - successfully saved checkpoint at iteration 2028 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 25851.77 -[exiting program after 1190.4112010161082 minutes] datetime: 2021-10-27 08:43:38 -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
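[Editorial note] The iteration records above follow a fixed "key: value | key: value" field layout, so they can be pulled into a table for tracking lm loss or grad norm over time. Below is a minimal parsing sketch; it is an editorial illustration, not part of Megatron-DeepSpeed, and the function name and dict keys are assumptions based only on the record format visible above.

    import re

    # Matches " iteration N/ M | <fields>" as logged above.
    ITER_RE = re.compile(r"iteration\s+(\d+)/\s*(\d+)\s*\|(.*)")

    def parse_iteration_line(line):
        """Parse one iteration record into a dict of its pipe-separated fields."""
        m = ITER_RE.search(line)
        if m is None:
            return None
        rec = {"iteration": int(m.group(1)), "train_iters": int(m.group(2))}
        for field in m.group(3).split("|"):
            if ":" in field:
                key, _, value = field.partition(":")
                rec[key.strip()] = value.strip()
        return rec

    # e.g. losses = [r["lm loss"] for r in map(parse_iteration_line, open("main_log.txt")) if r]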
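[Editorial note] The save above writes, under global_step2028/, one mp_rank_*_model_states.pt per model-parallel partition plus zero_pp_rank_*_mp_rank_*_optim_states.pt ZeRO optimizer shards (mp_rank_00 through mp_rank_127 paired with zero_pp_rank_0 in this listing). A sketch for spot-checking that such a step directory is complete; the function name and the default shard count of 128 are assumptions taken from the listing above, not from the training code.

    from pathlib import Path

    def check_global_step(checkpoint_root, step, expected_optim_shards=128):
        """Count the shard files present for one global_step directory."""
        step_dir = Path(checkpoint_root) / f"global_step{step}"
        model_states = list(step_dir.glob("mp_rank_*_model_states.pt"))
        optim_states = list(step_dir.glob("zero_pp_rank_*_optim_states.pt"))
        ok = len(optim_states) == expected_optim_shards
        print(f"{step_dir}: {len(model_states)} model-state files, "
              f"{len(optim_states)}/{expected_optim_shards} optimizer shards "
              f"-> {'complete' if ok else 'INCOMPLETE'}")
        return ok

    # check_global_step("/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints", 2028)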
-***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report--------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ---------------------------------------------------JIT compiled ops requires ninja -JIT compiled ops requires ninja - - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op name -op nameop name op name................................................ ................installedinstalledinstalled ..installed.... compatiblecompatiblecompatible.. - - - --------------------------------------------------compatible---------------------------------------------------------------------------------------------------- - - - --------------------------------------------------- -cpu_adamcpu_adamcpu_adam cpu_adam .............................. ............... ...............[NO][NO] .......[NO] [NO]....... [OKAY] ....... -.......[OKAY] -[OKAY][OKAY] - -fused_adam ............. [NO]fused_adam .......fused_adam.............fused_adam [OKAY].............[NO]............. -[NO] .......[NO] ....... [OKAY]fused_lamb ....... - [OKAY] .............[OKAY] - -[NO]fused_lamb fused_lamb....................fused_lamb ..........................[OKAY][NO] -[NO][NO]....... 
..............[OKAY] -[OKAY][OKAY] - -sparse_attn ............ [NO] ....... [OKAY] -sparse_attn sparse_attn............transformer sparse_attn ............[NO]........................ .......[NO][NO] [NO] ....... [OKAY]....... .......[OKAY] - - [OKAY][OKAY] - -transformertransformer transformerstochastic_transformer........................ [NO]............. [NO] [NO]....... [NO] ....... [OKAY]....... -.......[OKAY] -[OKAY][OKAY] - -stochastic_transformer stochastic_transformer. stochastic_transformer[NO] . ........[NO] [OKAY][NO]....... - .......[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op name ................op name installed .................. installedcompatible -..-------------------------------------------------- - compatible --------------------------------------------------- -cpu_adam ............... [NO]cpu_adam ....... ...............[OKAY] -[NO] ....... [OKAY] -fused_adam ............. [NO] ....... fused_adam[OKAY] -............. [NO] ....... fused_lamb[OKAY] -............. [NO] .......fused_lamb [OKAY] -............. [NO] ....... [OKAY] -sparse_attn ............ [NO]sparse_attn ....... ............[OKAY] -[NO] .......transformer [OKAY]............ - [NO] .......transformer [OKAY] -............ [NO] ....... stochastic_transformer[OKAY] -. [NO]stochastic_transformer ....... [OKAY]. - [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... 
[OKAY] -ninjastochastic_transformer .................. .[OKAY] -[NO]-------------------------------------------------- -.......op name [OKAY]................ - installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO]utils ......................... [OKAY][NO] - ....... [OKAY] -utils ..................quantizer [NO].............. .......[NO] [OKAY]....... - [OKAY] -quantizer-------------------------------------------------- -.............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.async_io - ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inferenceutils .................... [NO][NO] .............. [OKAY][OKAY] - -quantizer utils.............. ..................[NO] [NO]....... .......[OKAY] -[OKAY] ---------------------------------------------------quantizer - .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** ----------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
- runtime if needed. Op compatibility means that your system
- meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
..................[OKAY]..................[OKAY] - -[OKAY][OKAY]---------------------------------------------------------------------------------------------------- - - - -op name--------------------------------------------------op name-------------------------------------------------- -................ - ................op nameinstalled op name installed .. ................ .................. compatibleinstalledinstalled - compatible--------------------------------------------------.. - - --------------------------------------------------.. - compatible -compatible --------------------------------------------------- --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY]cpu_adamcpu_adam -cpu_adam .............................. [NO][NO] ....... ...................... [OKAY][OKAY] - -fused_adam [NO]............. .......[NO] [OKAY]....... - [OKAY] -fused_adamfused_adam .......................... fused_lamb [NO] [NO]............. .......fused_adam.......[NO] [OKAY][OKAY]....... - - .............[OKAY] -[NO]fused_lambfused_lamb .......................... .......[NO][NO] [OKAY] .............. - [OKAY][OKAY] - -sparse_attn fused_lamb............ .............[NO] ....... [OKAY][NO] - ....... sparse_attntransformer[OKAY] sparse_attn -........................ ............[NO][NO] [NO] ..................... [OKAY][OKAY][OKAY] - - -sparse_attntransformerstochastic_transformertransformer ............ . ........................[NO] [NO][NO] .............. [NO][OKAY] .......[OKAY] -.......[OKAY] - -[OKAY] -stochastic_transformer stochastic_transformer .. transformer [NO] [NO] ............ ..............[NO] [OKAY].......[OKAY] - -[OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninjaninjaninja .................................... .................. .................. [OKAY][OKAY] [OKAY] - -[OKAY] ------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op name--------------------------------------------------op nameop name - ................................................op name installed installedinstalled ................ ......installed compatible..compatiblecompatible - - -compatible---------------------------------------------------------------------------------------------------- - --------------------------------------------------- - --------------------------------------------------- -cpu_adam ............... [NO]cpu_adam cpu_adam .......cpu_adam............... [OKAY]............... - ...............[NO][NO] [NO].............. .......[OKAY][OKAY] -[OKAY] -fused_adam - ............. [NO] ....... [OKAY] -fused_adamfused_adam fused_lambfused_adam............. .......................... [NO][NO].............[NO] ....... .......[NO].......[OKAY] - .......[OKAY][OKAY] fused_lamb - -[OKAY] ............. - [NO]fused_lamb .......fused_lamb............. [OKAY].............[NO] - sparse_attn[NO]....... ................... [OKAY] [NO] -[OKAY] -....... [OKAY] -sparse_attntransformer ........................ [NO][NO] sparse_attn.............. sparse_attn [OKAY] ............[OKAY] -............ -[NO][NO]stochastic_transformer transformer.............. .............[OKAY] [OKAY] - -[NO][NO]transformer transformer.......................... ............[NO][OKAY] [OKAY] -....... -[NO] [OKAY]....... -stochastic_transformer [OKAY] -.stochastic_transformer stochastic_transformer[NO] ........ . [NO][OKAY][NO] - .............. 
[OKAY][OKAY] - -ninjaninjaninja ninja .................. .................................... ..................[OKAY] -[OKAY][OKAY][OKAY]-------------------------------------------------- - - - -----------------------------------------------------------------------------------------------------op name --------------------------------------------------- - -op name................op nameop name ................................installed ................installedinstalled .. installed .... compatible - ..compatible--------------------------------------------------compatible - -compatible --------------------------------------------------- --------------------------------------------------- - --------------------------------------------------- -cpu_adam cpu_adam............... cpu_adam ............... cpu_adam[NO] [NO]............... ............... ....... .......[NO] [NO] [OKAY] [OKAY] -.............. - [OKAY][OKAY] - -fused_adam ............. [NO] .......fused_adam fused_adamfused_adam[OKAY] ............. -............. ............. [NO] [NO] [NO]fused_lamb ....... .................... ....... [OKAY] [NO][OKAY] -[OKAY] - -....... [OKAY]fused_lamb -fused_lamb fused_lamb ............. ............. ............. [NO] [NO][NO]....... sparse_attn .............. [OKAY] -[OKAY][OKAY]............ - - [NO] ....... [OKAY] -transformer ............ [NO] sparse_attn.......sparse_attn sparse_attn[OKAY] -........................ ............[NO]stochastic_transformer[NO] [NO] ....... ........ ....... [OKAY] [OKAY][OKAY][NO] - - - .......transformertransformer transformer [OKAY] ........................ - ............ [NO] [NO] [NO] ....... .............. [OKAY][OKAY] -[OKAY] - -stochastic_transformerstochastic_transformer stochastic_transformer .. . [NO][NO] ..............[NO] [OKAY][OKAY]....... - - [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ----------------------------------------------------------------------------------------------------- -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninja -JIT compiled ops requires ninja -JIT compiled ops requires ninja - -ninjaninjaninjaninja .................................... 
....................................[OKAY] [OKAY] -[OKAY] - -[OKAY]---------------------------------------------------------------------------------------------------- --------------------------------------------------- - - ---------------------------------------------------op nameop name - ................op name ................op name installed ................installed................ installed....installed compatible -..compatible..-------------------------------------------------- - -compatiblecompatible-------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... [NO]cpu_adam ...................... cpu_adam[OKAY]cpu_adam -[NO] ..................................... [NO][NO][OKAY] -.............. [OKAY]fused_adam -[OKAY] -............. [NO] ....... fused_adam[OKAY] -............. [NO]fused_adam fused_lamb ....................fused_adam ............. [NO][OKAY] .............[NO] -....... .......[OKAY]fused_lamb -[NO] [OKAY].................... - [NO]fused_lamb[OKAY] ............. -....... sparse_attn[NO][OKAY]fused_lamb - ............ ....... ............. [NO] [OKAY][NO]....... - .......[OKAY] - [OKAY]sparse_attn --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -transformer ........................ [NO][NO] .......sparse_attn....... [OKAY][OKAY]............ - -JIT compiled ops requires ninja -sparse_attn transformer[NO]............ ............stochastic_transformer....... [NO] [NO][OKAY] . -....... .......[NO]transformer [OKAY] -.......[OKAY]............ - [OKAY]stochastic_transformer[NO]transformer - ................... . [OKAY] [NO] -[NO] .............. stochastic_transformer[OKAY][OKAY] - -. [NO] .......stochastic_transformer [OKAY] -. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------DeepSpeed C++/CUDA extension op report - - ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - -op name -op name op name op name................ ................ ................................installedinstalled installedinstalled.... .. ..compatiblecompatiblecompatible - - -compatible------------------------------------------------------------------------------------------------------------------------------------------------------ - - - --------------------------------------------------- -cpu_adamcpu_adamcpu_adam cpu_adam ............... .............................. ............... [NO][NO] [NO] [NO].............. ..............[OKAY][OKAY] - -[OKAY][OKAY] - -fused_adam .............fused_adam fused_adam[NO]fused_adam ............. .................... .............[NO] [OKAY][NO][NO] - ....... .......[OKAY]....... -fused_lamb [OKAY] [OKAY]............. -fused_lamb - fused_lamb[NO]............. fused_lamb .................... [NO] ....................[OKAY][NO] [OKAY] - ....... - [NO][OKAY] -....... [OKAY] -sparse_attn ............sparse_attn [NO]sparse_attn............ .......sparse_attn[NO]............ [OKAY] ................... -[NO] transformer [NO][OKAY] ....... -............ ....... [OKAY] [NO]transformer - [OKAY] .......transformer -............ [OKAY][NO] -............transformer ...................stochastic_transformer [NO] [NO] [OKAY] ........ -....... [NO][OKAY][OKAY] stochastic_transformer - -....... .[OKAY] -stochastic_transformer[NO]stochastic_transformer ........ .[OKAY][NO] - [NO]....... .......[OKAY] -[OKAY] --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja---------------------------------------------------------------------------------------------------- - - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninja-------------------------------------------------- --------------------------------------------------- - -JIT compiled ops requires ninja -JIT compiled ops requires ninja -ninjaninjaninja ninja .................. .................................... ..................[OKAY] [OKAY] -[OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -op name -op name op name................ op name ................................installed installed ................installed .. ..installed .. compatiblecompatible - -compatible..---------------------------------------------------------------------------------------------------- - - -compatible-------------------------------------------------- - --------------------------------------------------- -cpu_adamcpu_adam ...............cpu_adam............... cpu_adam [NO][NO] ............... ...................... ....... [NO] [OKAY] -[NO][OKAY]....... - .......[OKAY] -[OKAY] -fused_adam ............. fused_adam[NO] .................... fused_adamfused_adam [NO] [OKAY] .................... - ............. [OKAY][NO][NO] - fused_lamb....... ....... fused_lamb............. [OKAY] .............[OKAY] [NO] -[NO] - .......fused_lamb .......[OKAY] fused_lamb -[OKAY]............. - .............[NO] [NO]....... .......[OKAY] -[OKAY] -sparse_attn ............ sparse_attn[NO] ................... sparse_attnsparse_attn[NO] [OKAY]............................... - [OKAY]transformer[NO] -[NO]............ .......[NO] transformer ....... [OKAY]....... ............ - [OKAY][OKAY][NO] - -transformer .......transformerstochastic_transformer ............ [OKAY]............ -. [NO][NO][NO] stochastic_transformer..................... [OKAY] -[OKAY][OKAY]. - - [NO] .......stochastic_transformer [OKAY] stochastic_transformer - . .[NO] [NO]....... .......[OKAY] -[OKAY] ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
- --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................. .................. .................. [OKAY][OKAY] [OKAY] - -[OKAY] --------------------------------------------------- --------------------------------------------------- - -----------------------------------------------------------------------------------------------------op name - op name -op name................ op name ................installed................ ..................installedinstalled installedcompatible.. -.. --------------------------------------------------..compatiblecompatible - - -compatible-------------------------------------------------- --------------------------------------------------- - --------------------------------------------------- -cpu_adam ............... [NO] cpu_adam.......cpu_adamcpu_adam ...............[OKAY].............................. - [NO][NO][NO] .............. .......[OKAY][OKAY] - -[OKAY]fused_adam - ............. [NO] ....... [OKAY] -fused_adamfused_adam fused_adamfused_lamb............. .......................................[NO] [NO][NO][NO]....... ....... ....... .......[OKAY][OKAY] - -[OKAY][OKAY] - -fused_lamb fused_lamb.............fused_lamb .............[NO]............. [NO].......[NO] sparse_attn ....... [OKAY] ...................[OKAY] - -[OKAY][NO] - ....... [OKAY] -transformer ............ [NO] .......sparse_attn [OKAY]............sparse_attn - sparse_attn [NO] ............ ............stochastic_transformer....... [NO][OKAY] [NO]. -....... [NO][OKAY].......transformer - .......[OKAY]............ transformer -[OKAY] -[NO]............transformer ....... [NO] ............[OKAY] -.......[NO] [OKAY]....... - [OKAY] -stochastic_transformerstochastic_transformer stochastic_transformer. . [NO].[NO] .......[NO] ....... [OKAY] -.......[OKAY] -[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report --------------------------------------------------- - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -JIT compiled ops requires ninja -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------JIT compiled ops requires ninja - - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninjaninjaninjaninja .................. .................................... .................. [OKAY] [OKAY][OKAY] -[OKAY] - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -op name -op name op name op name................ ................ installed................installed................ installed....installed .. compatiblecompatible -.. --------------------------------------------------- --------------------------------------------------compatible - -compatible - --------------------------------------------------- --------------------------------------------------- -cpu_adamcpu_adam ..............................cpu_adam cpu_adam [NO] [NO]............... .............................[NO] [OKAY][NO][OKAY] - ....... -....... [OKAY][OKAY] - -fused_adam fused_adam............. [NO]fused_adamfused_adam............. ................................. [NO] [NO][OKAY][NO] ....... -....... .......[OKAY] -[OKAY][OKAY]fused_lamb - - .............fused_lamb [NO]fused_lamb.............fused_lamb ....... ............. [NO]............. [OKAY] -[NO].......[NO] .......[OKAY]....... - [OKAY][OKAY] - -sparse_attn ............ [NO] ....... sparse_attnsparse_attnsparse_attn[OKAY] -.................................... transformer[NO][NO][NO] ............ .............. ....... [NO] [OKAY][OKAY][OKAY] - -....... - [OKAY]transformertransformer - transformer ............ ............ ............ [NO] stochastic_transformer[NO] [NO] ....... ....... ........ [OKAY] [NO][OKAY][OKAY] - - -....... [OKAY] -stochastic_transformerstochastic_transformerstochastic_transformer ... [NO][NO][NO] ..................... 
[OKAY][OKAY][OKAY] - - ----------------------------------------------------------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report --------------------------------------------------- - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- -JIT compiled ops requires ninja - - -JIT compiled ops requires ninjaJIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninja .................. ninja.................................... [OKAY]..................[OKAY][OKAY] - -[OKAY] - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop nameop name op name ................ ................................ ................ installedinstalledinstalled installed ...... ..compatiblecompatible -compatible -compatible-------------------------------------------------- - --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- - -cpu_adamcpu_adam cpu_adam............... cpu_adam .............................. [NO] ............... [NO] [NO] .......[NO] ....... ....... [OKAY]....... [OKAY] - - [OKAY][OKAY] - -fused_adam .............fused_adam [NO]fused_adam............. fused_adam[NO]....... ............. ............. [OKAY] .......[NO] -[NO] .......[OKAY]fused_lamb ....... -[OKAY] -.............[OKAY]fused_lamb - fused_lamb[NO]............. ....................fused_lamb [NO] [OKAY][NO] ............. - ....... ....... [NO][OKAY] -[OKAY]....... - [OKAY] -sparse_attn ............ [NO] ....... [OKAY]sparse_attn - ............sparse_attn [NO]............ sparse_attntransformer ....... [NO]............[OKAY] ............ -....... [NO]transformer [OKAY][NO] - .......................... transformer[OKAY][NO] [OKAY] -................... - stochastic_transformer[OKAY][NO]transformer -.................... [OKAY][NO] -[NO]stochastic_transformer ..............stochastic_transformer. [NO][OKAY].[OKAY] - -....... [NO][OKAY]stochastic_transformer -....... [OKAY]. - [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja --------------------------------------------------- - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninjaninjaninja ninja.................. .................. .................................... [OKAY] [OKAY] -[OKAY][OKAY] --------------------------------------------------- - - -------------------------------------------------------------------------------------------------------------------------------------------------------op name - - -................op nameop nameop name ................ installed................ ................ installed..installed compatibleinstalled.... - --------------------------------------------------compatiblecompatible.. - - - ----------------------------------------------------------------------------------------------------compatible - - ---------------------------------------------------cpu_adam - ............... [NO] cpu_adamcpu_adam....... [OKAY]............... -cpu_adam............... [NO]............... [NO] .......[NO] .......[OKAY]....... -fused_adam [OKAY] [OKAY] -............. - [NO] ....... [OKAY] -fused_adam .............fused_lamb [NO]............. fused_adamfused_adam....... .............[NO][OKAY] -............. ....... [NO][NO][OKAY] fused_lamb -....... ....... ............. [OKAY] [OKAY] -[NO] - .......fused_lamb fused_lamb [OKAY] sparse_attn -.......................... ............[NO][NO] [NO].............. .......[OKAY][OKAY] -[OKAY]sparse_attn - - ............ transformer[NO] ................... [NO][OKAY] -....... [OKAY] -sparse_attntransformer ............sparse_attnstochastic_transformer............ [NO]............. [NO] [NO]....... [NO] .............. [OKAY] ....... - [OKAY] [OKAY] -[OKAY] -stochastic_transformer - transformertransformer. ........................[NO] [NO][NO]....... .......[OKAY]....... - [OKAY][OKAY] - -stochastic_transformer stochastic_transformer. [NO]. .......[NO] [OKAY]....... 
- [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - - - -JIT compiled ops requires ninja-------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja ...................................................... ..................[OKAY][OKAY][OKAY] - - ---------------------------------------------------[OKAY]-------------------------------------------------- - --------------------------------------------------- -op name ---------------------------------------------------op name op name -................ ................ op name ................ installedinstalled ................ installed.. .. installed compatible..compatible.. - - ----------------------------------------------------------------------------------------------------compatiblecompatible - - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ...............cpu_adam cpu_adam...............[NO]cpu_adam [NO]..................................... ....... [OKAY][NO] -[NO] [OKAY] -.............. [OKAY][OKAY] - -fused_adam ............. [NO] fused_adam....... fused_adam............. fused_adam [OKAY] [NO] -.......................... ....... fused_lamb[NO] [NO] [OKAY]............. -....... ....... [NO][OKAY] fused_lamb -[OKAY]....... ............. -[OKAY] fused_lamb -[NO] fused_lamb .................... ............. [NO] [OKAY] [NO] -....... .......[OKAY] [OKAY] - -sparse_attn ............ [NO] ....... [OKAY] -transformer sparse_attn............ ............[NO] sparse_attn sparse_attn....... [NO]........................ [OKAY]....... [NO] - [NO] [OKAY] stochastic_transformer....... -....... transformer[OKAY].[OKAY] - -............[NO] transformer[NO]transformer....... ....... ........................ [OKAY][OKAY] - -[NO][NO] .............. [OKAY][OKAY] - -stochastic_transformer stochastic_transformer.stochastic_transformer [NO]. . ....... [NO] [NO] [OKAY] ....... -....... [OKAY][OKAY] - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
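[editor's note] The report above is emitted at import time by every rank; "[NO] ....... [OKAY]" means an op is not prebuilt into the wheel but is JIT-compilable on this system (ninja was found). The same table can be regenerated on a node with DeepSpeed's ds_report utility; below is a minimal sketch, assuming a DeepSpeed of this vintage (~0.5.5) that exposes its op builders under deepspeed.ops.op_builder, of querying individual ops from Python:

    # Sketch: query the JIT-compatibility info that the report prints.
    # Builder classes are assumptions based on the ops listed above.
    from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder

    for builder in (CPUAdamBuilder(), FusedAdamBuilder()):
        # is_compatible() == True corresponds to the "[OKAY]" column:
        # the op can be JIT-built at runtime even though it is "[NO]"
        # (not shipped precompiled in the installed wheel).
        print(builder.NAME, "compatible:", builder.is_compatible())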
-[every rank then probes the inference/IO ops; the repeated libaio warnings and identical result tables are collapsed into one copy]
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
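[editor's note] The libaio warnings are benign for this training run: only the async_io op fails its probe ("[NO] ....... [NO]", neither installed nor compatible), while transformer_inference, utils, and quantizer remain JIT-compatible. A hedged sketch of reproducing that probe, under the same op_builder assumption as above:

    # Sketch: AsyncIOBuilder's compatibility check is what emits the
    # libaio warnings above; it stays False until the libaio dev
    # headers/.so are visible (libaio-devel via yum, or CFLAGS/LDFLAGS
    # pointing at a source install, as the warning text suggests).
    from deepspeed.ops.op_builder import AsyncIOBuilder

    if not AsyncIOBuilder().is_compatible():
        print("async_io unavailable; harmless unless aio offload is needed")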
[OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_ioutils ................................. [NO][NO] .............. [OKAY][NO] - -quantizer .............. [NO] ....... [OKAY] -transformer_inference-------------------------------------------------- -.. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  async_io: please install the libaio-devel package with yumasync_io - ............... [NO] ....... [NO] -DeepSpeed general environment info: -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -quantizer .............. [NO] ....... [OKAY] -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - [WARNING]  async_io: please install the libaio-devel package with yum -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -async_ioasync_io ............... [NO]............... .......[NO] [NO]....... - [NO] -transformer_inferencetransformer_inference .... [NO][NO] ....... .......[OKAY] -[OKAY] -utilsutils .................................... [NO][NO] .............. [OKAY][OKAY] - -quantizer ..............quantizer [NO].............. .......[NO] [OKAY]....... 
-[OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -nvcc version ..................... 11.2 -async_io ............... [NO] ....... [NO] -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch versionDeepSpeed general environment info: .................... -1.8.1 -torch cuda version ...............torch install path 11.1 -...............nvcc version ..................... 11.2 -deepspeed install path ...........['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']torch version - deepspeed info.................... ...................1.8.1 -0.5.5+29bee73, 29bee73, master -torch cuda versiondeepspeed wheel compiled w. ..................... 11.1torch 1.8, cuda 11.1 - -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -async_io ............... [NO] ....... [NO] -async_io ............... async_io[NO] ....... ...............[NO] -[NO] ....... [NO]transformer_inference - .. [NO] ....... [OKAY] -transformer_inference .. [NO] .......transformer_inference utils [OKAY] .. - ..................[NO] [NO]....... .......utils[OKAY] -[OKAY].................. - [NO] ....... [OKAY]utilsquantizer - ................................ quantizer [NO] [NO] .............. ....... .......[NO] [OKAY].......[OKAY] - -[OKAY] -quantizer ..............-------------------------------------------------- -[NO]-------------------------------------------------- -....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ......DeepSpeed general environment info: torch 1.8, cuda 11.1 - -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -utils .................. [NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io async_io............... [NO]............... .......[NO] [NO]....... - [NO] -transformer_inference .. transformer_inference[NO] ......... [NO][OKAY] -....... [OKAY] -utils .................. utils[NO] ......................... [NO][OKAY] -....... [OKAY] -quantizer .............. quantizer[NO] ..................... [NO][OKAY] -....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. 
-async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - [WARNING]  async_io: please install the libaio-devel package with yum -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - [WARNING]  async_io: please install the libaio-devel package with yum -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -.................. [NO] ....... [OKAY] -quantizer .............. [NO] async_io....... [OKAY] -............... [NO] .......-------------------------------------------------- -[NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -DeepSpeed general environment info: -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 - [WARNING]  async_io: please install the libaio-devel package with yum -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 
11.2 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -async_io ............... [NO] ....... [NO] -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch version torch version.................... ....................1.8.1 -1.8.1 -torch cuda version torch cuda version............... ...............11.1 -11.1nvcc version - nvcc version..................... .....................11.2 -11.2deepspeed install path - deepspeed install path........... ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. 
-async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install pathDeepSpeed general environment info: ............... -torch install path['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - ...............torch version .................... 1.8.1 -['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']torch cuda version - ............... torch version11.1 -....................nvcc version 1.8.1..................... - 11.2 -torch cuda versiondeepspeed install path .......................... 11.1 -nvcc version['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -.....................deepspeed info 11.2................... - deepspeed install path0.5.5+29bee73, 29bee73, master -...........deepspeed wheel compiled w. ...... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']torch 1.8, cuda 11.1 - -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -DeepSpeed general environment info:torch version - .................... 1.8.1 -torch install path torch cuda version............... ............... 11.1 -nvcc version .....................['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -11.2 -deepspeed install path torch version........... .................... 1.8.1 -['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -torch cuda versiondeepspeed info .................................. 11.10.5.5+29bee73, 29bee73, master - -nvcc versiondeepspeed wheel compiled w. ........................... 11.2torch 1.8, cuda 11.1 - -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ...............DeepSpeed general environment info: 11.1 - -nvcc version ..................... 11.2torch install path -deepspeed install path .......................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ...................['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -0.5.5+29bee73, 29bee73, master -torch versiondeepspeed wheel compiled w. .......................... 1.8.1torch 1.8, cuda 11.1 - -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 
11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch version torch version.................... ....................1.8.1 -1.8.1 -torch cuda version torch cuda version............... ...............11.1 -11.1nvcc version - nvcc version..................... .....................11.2 -11.2deepspeed install path - ...........deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ******** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1725083.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 150
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 1190
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.006
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 0.0001
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 3750000
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size ..........................
5 - tile_factor ..................................... 1 - titles_data_path ................................ None - tokenizer_name_or_path .......................... None - tokenizer_type .................................. GPT2BPETokenizer - train_iters ..................................... None - train_samples ................................... 600000000 - train_tokens .................................... 300000000000 - use_bnb_optimizer ............................... False - use_checkpoint_lr_scheduler ..................... False - use_contiguous_buffers_in_ddp ................... False - use_cpu_initialization .......................... None - use_one_sent_docs ............................... False - use_pin_memory .................................. False - virtual_pipeline_model_parallel_size ............ None - vocab_extra_ids ................................. 0 - vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json - weight_decay .................................... 0.1 - world_size ...................................... 128 - zero_allgather_bucket_size ...................... 0.0 - zero_contigious_gradients ....................... False - zero_reduce_bucket_size ......................... 0.0 - zero_reduce_scatter ............................. False - zero_stage ...................................... 1 --------------------- end of arguments --------------------- -setting number of micro-batches to constant 2048 -> building GPT2BPETokenizer tokenizer ... -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: 
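[annotation] A quick consistency check on the argument dump above: the 128-process world factors exactly into the three parallelism degrees, and the constant micro-batch count follows from the batch-size settings. A minimal sketch in Python (variable names are ours, values copied from the arguments):

    world_size = 128
    tp, pp, dp = 4, 32, 1                        # tensor / pipeline / data parallel sizes
    assert world_size == tp * pp * dp            # 4 * 32 * 1 == 128
    global_batch_size, micro_batch_size = 2048, 1
    # micro-batches per optimizer step = global batch / (micro batch * data parallelism)
    assert global_batch_size // (micro_batch_size * dp) == 2048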
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-27 21:22:39,117] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.307 seconds
-> compiling and loading fused kernels ...
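[annotation] Two of the derived numbers above can be reproduced directly. The padded vocabulary rounds 50257 up to a multiple of make_vocab_size_divisible_by (128) times the tensor-parallel size (4); and, if we recall Megatron's seeding convention correctly (an assumption, not stated in the log), the model-parallel CUDA seed is the base seed plus a fixed 2718 offset plus the tensor-parallel rank. A minimal sketch:

    import math
    vocab_size, divisor = 50257, 128 * 4         # make_vocab_size_divisible_by * TP
    padded = math.ceil(vocab_size / divisor) * divisor
    assert padded == 50688 and padded - vocab_size == 431
    # model-parallel seed reported for rank 0 (assumed Megatron offset of 2718)
    assert 43 + 2718 + 0 == 2761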
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-!! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-!! WARNING !!
-
- warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
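[annotation] The three modules above are JIT-compiled through PyTorch's C++ extension loader, which is what emits the ninja messages; "ninja: no work to do." means a previously cached build was reused. A sketch of roughly how such a kernel is built and loaded (source file names here are illustrative, not taken from the log):

    from torch.utils.cpp_extension import load

    scaled_masked_softmax_cuda = load(
        name="scaled_masked_softmax_cuda",
        sources=["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"],
        extra_cuda_cflags=["-O3"],
        verbose=True,   # prints the "Emitting ninja build file ..." lines seen above
    )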
->>> done with compiling and loading fused kernels. Compilation time: 21.035 seconds
-time to initialize megatron (seconds): 20.592
-[after megatron is initialized] datetime: 2021-10-27 21:23:00
-building GPT model ...
-[2021-10-27 21:23:00,662] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-27 21:23:00,662] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-27 21:23:00,663] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.07 GB, percent = 20.9%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0,
model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 
121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
-[2021-10-27 21:23:02,336] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
- 0: _to_float16
- 1: EmbeddingPipe
- 2:
- 3: ParallelTransformerLayerPipe
- 4: ParallelTransformerLayerPipe
-stage=1 layers=2
- 5: ParallelTransformerLayerPipe
- 6: ParallelTransformerLayerPipe
-stage=2 layers=2
- 7: ParallelTransformerLayerPipe
- 8: ParallelTransformerLayerPipe
-stage=3 layers=2
- 9: ParallelTransformerLayerPipe
- 10: ParallelTransformerLayerPipe
-stage=4 layers=2
- 11: ParallelTransformerLayerPipe
- 12: ParallelTransformerLayerPipe
-stage=5 layers=2
- 13: ParallelTransformerLayerPipe
- 14: ParallelTransformerLayerPipe
-stage=6 layers=2
- 15: ParallelTransformerLayerPipe
- 16: ParallelTransformerLayerPipe
-stage=7 layers=2
- 17: ParallelTransformerLayerPipe
- 18: ParallelTransformerLayerPipe
-stage=8 layers=2
- 19: ParallelTransformerLayerPipe
- 20: ParallelTransformerLayerPipe
-stage=9 layers=2
- 21: ParallelTransformerLayerPipe
- 22: ParallelTransformerLayerPipe
-stage=10 layers=2
- 23: ParallelTransformerLayerPipe
- 24: ParallelTransformerLayerPipe
-stage=11 layers=2
- 25: ParallelTransformerLayerPipe
- 26: ParallelTransformerLayerPipe
-stage=12 layers=2
- 27: ParallelTransformerLayerPipe
- 28: ParallelTransformerLayerPipe
-stage=13 layers=2
- 29: ParallelTransformerLayerPipe
- 30: ParallelTransformerLayerPipe
-stage=14 layers=2
- 31: ParallelTransformerLayerPipe
- 32: ParallelTransformerLayerPipe
-stage=15 layers=2
- 33: ParallelTransformerLayerPipe
- 34: ParallelTransformerLayerPipe
-stage=16 layers=2
- 35: ParallelTransformerLayerPipe
- 36: ParallelTransformerLayerPipe
-stage=17 layers=2
- 37: ParallelTransformerLayerPipe
- 38: ParallelTransformerLayerPipe
-stage=18 layers=2
- 39: ParallelTransformerLayerPipe
- 40: ParallelTransformerLayerPipe
-stage=19 layers=2
- 41: ParallelTransformerLayerPipe
- 42: ParallelTransformerLayerPipe
-stage=20 layers=2
- 43: ParallelTransformerLayerPipe
- 44: ParallelTransformerLayerPipe
-stage=21 layers=2
- 45: ParallelTransformerLayerPipe
- 46: ParallelTransformerLayerPipe
-stage=22 layers=2
- 47: ParallelTransformerLayerPipe
- 48: ParallelTransformerLayerPipe
-stage=23 layers=2
- 49: ParallelTransformerLayerPipe
- 50: ParallelTransformerLayerPipe
-stage=24 layers=2
- 51: ParallelTransformerLayerPipe
- 52: ParallelTransformerLayerPipe
-stage=25 layers=2
- 53: ParallelTransformerLayerPipe
- 54: ParallelTransformerLayerPipe
-stage=26 layers=2
- 55: ParallelTransformerLayerPipe
- 56: ParallelTransformerLayerPipe
-stage=27 layers=2
- 57: ParallelTransformerLayerPipe
- 58: ParallelTransformerLayerPipe
-stage=28 layers=2
- 59: ParallelTransformerLayerPipe
- 60: ParallelTransformerLayerPipe
-stage=29 layers=2
- 61: ParallelTransformerLayerPipe
- 62: ParallelTransformerLayerPipe
-stage=30 layers=2
- 63: ParallelTransformerLayerPipe
- 64: ParallelTransformerLayerPipe
-stage=31 layers=6
- 65: ParallelTransformerLayerPipe
- 66: ParallelTransformerLayerPipe
- 67:
- 68: MixedFusedLayerNorm
- 69: EmbeddingPipe
- 70: float16_to_fp32
- loss: CrossEntropy
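[annotation] Both the topology map and the partitioning are regular. With data_parallel_size 1, the global rank is pipe * 4 + model, and the 64 transformer layers divide evenly into 2 per pipeline stage; stage 0 additionally carries the fp16 cast and the embedding, stage 31 the tied embedding, final layernorm, fp32 cast and the loss. A sketch of the mapping (names ours, values checked against the dump above):

    TP, PP, NUM_LAYERS = 4, 32, 64

    def global_rank(pipe: int, model: int) -> int:
        # with DP == 1 the data coordinate contributes nothing
        return pipe * TP + model

    assert global_rank(10, 1) == 41 and global_rank(31, 3) == 127
    assert NUM_LAYERS // PP == 2     # ParallelTransformerLayerPipe entries per stage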
- > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
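[annotation] The per-rank counts match what the arguments predict, down to the unit. Each middle stage holds two transformer layers sliced four ways; the first and last stages add the vocabulary and position embeddings, and the last stage a final layernorm. A sketch of the accounting, assuming the standard Megatron parameter layout (column-parallel fused QKV and fc1, row-parallel projections with unsharded biases):

    h, ffn, tp = 11600, 46400, 4                 # hidden_size, ffn_hidden_size, TP
    per_layer  = (3*h*h + 3*h) // tp             # fused QKV weight + bias (column-parallel)
    per_layer += h*h // tp + h                   # attention output proj + full bias
    per_layer += h*ffn // tp + ffn // tp         # MLP h->ffn weight + bias
    per_layer += ffn*h // tp + h                 # MLP ffn->h weight + full bias
    per_layer += 4*h                             # two LayerNorms (weight + bias each)
    assert 2 * per_layer == 807_539_800          # middle stages: 2 layers per rank
    embed = 50688*h // tp + 2048*h               # sliced word embeddings + position embeddings
    assert 2*per_layer + embed == 978_291_800           # first stage
    assert 2*per_layer + embed + 2*h == 978_315_000     # last stage: + final LayerNorm
    total = tp * (30 * 2*per_layer + (2*per_layer + embed) + (2*per_layer + embed + 2*h))
    assert total == 104_731_203_200              # ~104.7B parameters: the "104B" in tr8b-104B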
pipeline) model parallel rank (2, 31): 978315000 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
-[2021-10-27 21:23:03,028] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-27 21:23:03,029] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-27 21:23:03,029] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.24 GB, percent = 21.0%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-27 21:23:03,030] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
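The per-stage parameter counts above are consistent with a roughly 104.7B-parameter model. A quick back-of-the-envelope check in Python; the 4-way tensor by 32-way pipeline grid is inferred from the rank coordinates (0..3, 0..31), not stated explicitly in the log, and the attribution of the larger first/last stages to embedding/output layers is an assumption:

    # Sanity-check of the per-rank parameter counts reported above.
    tp, pp = 4, 32            # assumed (tensor, pipeline) parallel sizes
    middle = 807_539_800      # per tensor-parallel shard, pipeline stages 1..30
    first = 978_291_800       # stage 0 (presumably carries the input embedding)
    last = 978_315_000        # stage 31 (presumably carries the output head/final norm)
    total = tp * ((pp - 2) * middle + first + last)
    print(f"total parameters: {total:,}")  # -> total parameters: 104,731,203,200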
-[2021-10-27 21:23:03,069] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-27 21:23:03,069] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-27 21:23:03,069] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-27 21:23:03,070] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-27 21:23:03,070] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-27 21:23:03,070] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-27 21:23:03,070] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-27 21:23:03,070] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-27 21:23:03,070] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-27 21:23:03,070] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils...
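The compiler-mismatch warning above comes from torch.utils.cpp_extension when the default c++ on PATH is not the g++ toolchain PyTorch was built with. One common workaround, not shown in this log and offered only as a sketch, relies on cpp_extension consulting the CXX environment variable for JIT builds:

    # Hypothetical workaround sketch: select g++ for torch cpp_extension
    # JIT builds. Must run before the first extension is built.
    import os
    os.environ["CXX"] = "g++"  # cpp_extension falls back to $CXX, default "c++"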
-Time to load utils op: 0.7808740139007568 seconds
-Time to load utils op: 0.779437780380249 seconds
-Time to load utils op: 0.7813599109649658 seconds
-Time to load utils op: 0.7816610336303711 seconds
-Time to load utils op: 0.7787785530090332 seconds
-Time to load utils op: 0.7771420478820801 seconds
-Time to load utils op: 0.7777352333068848 seconds
-Time to load utils op: 0.77335524559021 seconds
-Time to load utils op: 0.7773048877716064 seconds
-Time to load utils op: 0.7802278995513916 seconds
-Time to load utils op: 0.7761890888214111 seconds
-Time to load utils op: 0.7716977596282959 seconds
-Time to load utils op: 0.775770902633667 seconds
-Time to load utils op: 0.7763783931732178 seconds
-Time to load utils op: 0.7794530391693115 seconds
-Time to load utils op: 0.779348611831665 seconds
-Time to load utils op: 0.7794966697692871 seconds
-Time to load utils op: 0.781407356262207 seconds
-Time to load utils op: 0.7760372161865234 seconds
-Time to load utils op: 0.7826178073883057 seconds
-Time to load utils op: 0.7824904918670654 seconds
-Time to load utils op: 0.7824552059173584 seconds
-Time to load utils op: 0.7868561744689941 seconds
-Time to load utils op: 0.7852318286895752 seconds
-Time to load utils op: 0.7867424488067627 seconds
-Time to load utils op: 0.786372184753418 seconds
-Time to load utils op: 0.7816011905670166 seconds
-Time to load utils op: 0.7771217823028564 seconds
-Time to load utils op: 0.781425952911377 seconds
-Time to load utils op: 0.7835760116577148 seconds
-Time to load utils op: 0.7832014560699463 seconds
-Time to load utils op: 0.7821853160858154 seconds
-Time to load utils op: 0.7787477970123291 seconds
-Time to load utils op: 0.7774002552032471 seconds
-Time to load utils op: 0.7759287357330322 seconds
-Time to load utils op: 0.7786221504211426 seconds
-Time to load utils op: 0.7776010036468506 seconds
-Time to load utils op: 0.7845418453216553 seconds
-Time to load utils op: 0.7865960597991943 seconds
-Time to load utils op: 0.7863655090332031 seconds
-Time to load utils op: 0.787574291229248 seconds
-Time to load utils op: 0.7870609760284424 seconds
-Time to load utils op: 0.7856636047363281 seconds
-Time to load utils op: 0.7858741283416748 seconds
-Time to load utils op: 0.7826378345489502 seconds
-Time to load utils op: 0.7977445125579834 seconds
-Time to load utils op: 0.794550895690918 seconds
-Time to load utils op: 0.7983038425445557 seconds
-Time to load utils op: 0.7948377132415771 seconds
-Time to load utils op: 0.796027421951294 seconds
-Time to load utils op: 0.7958085536956787 seconds
-Time to load utils op: 0.8128108978271484 seconds
-Time to load utils op: 0.8127410411834717 seconds
-Time to load utils op: 0.8082389831542969 seconds
-Time to load utils op: 0.8063948154449463 seconds
-Time to load utils op: 0.8061277866363525 seconds
-Time to load utils op: 0.8120148181915283 seconds
-Time to load utils op: 0.8146884441375732 seconds
-Time to load utils op: 0.8144698143005371 seconds
-Time to load utils op: 0.8199586868286133 seconds
-Time to load utils op: 0.8193337917327881 seconds
-Time to load utils op: 0.8151676654815674 seconds
-Time to load utils op: 0.8208065032958984 seconds
-Time to load utils op: 0.8205430507659912 seconds
-Time to load utils op: 0.816511869430542 seconds
-Time to load utils op: 0.8185043334960938 seconds
-Time to load utils op: 0.8178966045379639 seconds
-Time to load utils op: 0.8175444602966309 seconds
-Time to load utils op: 0.8160223960876465 seconds
-Time to load utils op: 0.8172924518585205 seconds
-Time to load utils op: 0.8159899711608887 seconds
-Time to load utils op: 0.8240005970001221 seconds
-Time to load utils op: 0.823817253112793 seconds
-Time to load utils op: 0.8246185779571533 seconds
-Time to load utils op: 0.8223936557769775 seconds
-Time to load utils op: 0.7349064350128174 seconds
-Time to load utils op: 0.7291369438171387 seconds
-Time to load utils op: 0.742849588394165 seconds
-Time to load utils op: 0.7428832054138184 seconds
-Time to load utils op: 0.8206753730773926 seconds
-Time to load utils op: 0.8193259239196777 seconds
-Time to load utils op: 0.8191845417022705 seconds
-Time to load utils op: 0.8176002502441406 seconds
-Time to load utils op: 0.8239963054656982 seconds
-Time to load utils op: 0.8236329555511475 seconds
-Time to load utils op: 0.748835563659668 seconds
-Time to load utils op: 0.8253920078277588 seconds
-Time to load utils op: 0.8271651268005371 seconds
-Time to load utils op: 0.8272991180419922 seconds
-Time to load utils op: 0.82716965675354 seconds
-Time to load utils op: 0.8258497714996338 seconds
-Time to load utils op: 0.8319792747497559 seconds
-Time to load utils op: 0.8277068138122559 seconds
-Time to load utils op: 0.8317873477935791 seconds
-Time to load utils op: 0.8313174247741699 seconds
-Time to load utils op: 0.8285672664642334 seconds
-Time to load utils op: 0.8292522430419922 seconds
-Time to load utils op: 0.8262183666229248 seconds
-Time to load utils op: 0.740135908126831 seconds
-Time to load utils op: 0.7588427066802979 seconds
-Time to load utils op: 0.7526493072509766 seconds
-Time to load utils op: 0.8406631946563721 seconds
-Time to load utils op: 0.8358447551727295 seconds
-Time to load utils op: 0.8392562866210938 seconds
-Time to load utils op: 0.8405194282531738 seconds
-Time to load utils op: 0.8350875377655029 seconds
-Time to load utils op: 0.8370249271392822 seconds
-Time to load utils op: 0.8372929096221924 seconds
-Time to load utils op: 0.8467998504638672 seconds
-Time to load utils op: 0.8588583469390869 seconds
-Time to load utils op: 0.8511030673980713 seconds
-Time to load utils op: 0.8536303043365479 seconds
-Time to load utils op: 0.8539767265319824 seconds
-Time to load utils op: 0.8490481376647949 seconds
-Time to load utils op: 0.8446812629699707 seconds
-Time to load utils op: 0.8481013774871826 seconds
-Time to load utils op: 0.8540687561035156 seconds
-Time to load utils op: 0.8527994155883789 seconds
-Time to load utils op: 0.8540661334991455 seconds
-Time to load utils op: 0.854271650314331 seconds
-Time to load utils op: 0.8558697700500488 seconds
-Time to load utils op: 0.8530623912811279 seconds
-Time to load utils op: 0.8577611446380615 seconds
-Time to load utils op: 0.8516781330108643 seconds
-Time to load utils op: 0.8583393096923828 seconds
-Time to load utils op: 0.8608193397521973 seconds
-Time to load utils op: 0.8601117134094238 seconds
-Time to load utils op: 0.8598062992095947 seconds
-Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 64 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 97 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 46 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-[2021-10-27 21:23:06,324] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-27 21:23:06,325] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-27 21:23:06,325] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.27 GB, percent = 21.0%
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
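The two size tuples per rank line up with the parameter counts reported earlier (e.g. 807,360,000 + 179,800 = 807,539,800 for a middle pipeline stage), and the partition count [1, 1] follows from the topology: with all 128 GPUs consumed by model parallelism, the ZeRO data-parallel group has size 1. A sketch of that cross-check; the 4x32 grid and the interpretation of the two tuples as the two optimizer param groups are inferred, not stated in the log:

    # Cross-check partition sizes against the per-rank counts above.
    middle_groups = [807_360_000, 179_800]    # two optimizer param groups, middle stages
    assert sum(middle_groups) == 807_539_800
    first_groups = [978_112_000, 179_800]     # first pipeline stage
    assert sum(first_groups) == 978_291_800
    last_groups = [978_112_000, 203_000]      # last pipeline stage
    assert sum(last_groups) == 978_315_000
    world, tp, pp = 128, 4, 32
    dp = world // (tp * pp)
    assert dp == 1  # one ZeRO partition per group -> "partition count [1, 1]"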
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0010280609130859375 seconds
-Time to load utils op: 0.0010476112365722656 seconds
-Time to load utils op: 0.0010099411010742188 seconds
-Time to load utils op: 0.0010366439819335938 seconds
-Time to load utils op: 0.0022530555725097656 seconds
-Time to load utils op: 0.002471446990966797 seconds
-Time to load utils op: 0.002399444580078125 seconds
-Time to load utils op: 0.0018587112426757812 seconds
-Time to load utils op: 0.001255035400390625 seconds
-Time to load utils op: 0.0021009445190429688 seconds
-Time to load utils op: 0.0019567012786865234 seconds
-Time to load utils op: 0.0012590885162353516 seconds
-Time to load utils op: 0.0013012886047363281 seconds
-Time to load utils op: 0.001802682876586914 seconds
-Time to load utils op: 0.0014717578887939453 seconds
-Time to load utils op: 0.0012216567993164062 seconds
-Time to load utils op: 0.002270221710205078 seconds
-Time to load utils op: 0.0015187263488769531 seconds
-Time to load utils op: 0.0012290477752685547 seconds
-Time to load utils op: 0.0023071765899658203 seconds
-Time to load utils op: 0.0014290809631347656 seconds
-Time to load utils op: 0.0012774467468261719 seconds
-Time to load utils op: 0.001516103744506836 seconds
-Time to load utils op: 0.0016963481903076172 seconds
-Time to load utils op: 0.001743316650390625 seconds
-Time to load utils op: 0.0017783641815185547 seconds
-Time to load utils op: 0.0012097358703613281 seconds
-Time to load utils op: 0.0012712478637695312 seconds
-Time to load utils op: 0.0012764930725097656 seconds
-Time to load utils op: 0.0014269351959228516 seconds
-Time to load utils op: 0.0014083385467529297 seconds
-Time to load utils op: 0.0015959739685058594 seconds
-Time to load utils op: 0.0014026165008544922 seconds
-Time to load utils op: 0.0020325183868408203 seconds
-Time to load utils op: 0.0016274452209472656 seconds
-Time to load utils op: 0.001714944839477539 seconds
-Time to load utils op: 0.001430511474609375 seconds
-Time to load utils op: 0.0013048648834228516 seconds
-Time to load utils op: 0.0015730857849121094 seconds
-Time to load utils op: 0.002042531967163086 seconds
-Time to load utils op: 0.001481771469116211 seconds
-Time to load utils op: 0.0014858245849609375 seconds
-Time to load utils op: 0.0016551017761230469 seconds
-Time to load utils op: 0.0019447803497314453 seconds
-Time to load utils op: 0.0018334388732910156 seconds
-Time to load utils op: 0.0017006397247314453 seconds
-Time to load utils op: 0.0018835067749023438 seconds
-Time to load utils op: 0.0018398761749267578 seconds
-Time to load utils op: 0.0013301372528076172 seconds
-Time to load utils op: 0.0012886524200439453 seconds
-Time to load utils op: 0.0013208389282226562 seconds
-Time to load utils op: 0.001455068588256836 seconds
-Time to load utils op: 0.0012178421020507812 seconds
-Time to load utils op: 0.0012531280517578125 seconds
-Time to load utils op: 0.0015759468078613281 seconds
-Time to load utils op: 0.0012094974517822266 seconds
-Time to load utils op: 0.0012044906616210938 seconds
-Time to load utils op: 0.0012538433074951172 seconds
-Time to load utils op: 0.0021944046020507812 seconds
-Time to load utils op: 0.0014719963073730469 seconds
-Time to load utils op: 0.0019495487213134766 seconds
-Time to load utils op: 0.0020422935485839844 seconds
-Time to load utils op: 0.0013186931610107422 seconds
-Time to load utils op: 0.0013227462768554688 seconds
-Time to load utils op: 0.0019388198852539062 seconds
-Time to load utils op: 0.0017066001892089844 seconds
-Time to load utils op: 0.0014052391052246094 seconds -Time to load utils op: 0.001230001449584961 seconds -Time to load utils op: 0.0013701915740966797 seconds -Time to load utils op: 0.001142740249633789 secondsTime to load utils op: 0.0012278556823730469 seconds - -Time to load utils op: 0.001955747604370117 seconds -Time to load utils op: 0.002138376235961914 seconds -Time to load utils op: 0.0012030601501464844 seconds -Time to load utils op: 0.0018911361694335938 seconds -Time to load utils op: 0.0020508766174316406 seconds -Time to load utils op: 0.001973867416381836 seconds -Time to load utils op: 0.0013840198516845703 seconds -Time to load utils op: 0.001157522201538086 seconds -Time to load utils op: 0.0019214153289794922 seconds -Time to load utils op: 0.0019061565399169922 seconds -Time to load utils op: 0.0013828277587890625 seconds -Time to load utils op: 0.0012362003326416016 seconds -Time to load utils op: 0.001367330551147461 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
-Time to load utils op: 0.0017108917236328125 seconds -Time to load utils op: 0.0018038749694824219 seconds -Time to load utils op: 0.0015819072723388672 seconds -Time to load utils op: 0.0016813278198242188 seconds -Time to load utils op: 0.0016660690307617188 seconds -Time to load utils op: 0.0017657279968261719 secondsTime to load utils op: 0.001802682876586914 seconds - -Time to load utils op: 0.0017733573913574219 secondsTime to load utils op: 0.0016715526580810547 seconds - -Time to load utils op: 0.0017235279083251953 seconds -Time to load utils op: 0.0023589134216308594 seconds -Time to load utils op: 0.0019121170043945312 seconds -Time to load utils op: 0.003246784210205078 seconds -Time to load utils op: 0.0016584396362304688 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Loading extension module utils... -Time to load utils op: 0.002439737319946289 seconds -Time to load utils op: 0.0023109912872314453 seconds -Time to load utils op: 0.0028722286224365234 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Time to load utils op: 0.002614736557006836 seconds -Loading extension module utils... -Time to load utils op: 0.0011751651763916016 seconds -Time to load utils op: 0.0013415813446044922 seconds -Time to load utils op: 0.0012636184692382812 secondsTime to load utils op: 0.001138925552368164 seconds - -Time to load utils op: 0.003387451171875 seconds -Time to load utils op: 0.0034122467041015625 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0012781620025634766 seconds -Time to load utils op: 0.0013091564178466797 secondsTime to load utils op: 0.0013666152954101562 seconds - -Time to load utils op: 0.0013697147369384766 seconds -Time to load utils op: 0.0013110637664794922 seconds -Time to load utils op: 0.0014240741729736328 seconds -Time to load utils op: 0.0014188289642333984 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0013689994812011719 seconds -Time to load utils op: 0.0014221668243408203 seconds -Time to load utils op: 0.0013546943664550781 seconds -Time to load utils op: 0.0013494491577148438 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0013740062713623047 seconds -Time to load utils op: 0.0014612674713134766 seconds -Time to load utils op: 0.0014040470123291016 seconds -Time to load utils op: 0.0015492439270019531 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils... 
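The extension-loader messages above are printed once per rank by PyTorch's JIT C++ extension machinery, which DeepSpeed uses to compile its fused ops on first use and then reuse the cached build from the torch_extensions root. A minimal sketch of that mechanism, assuming a hypothetical source file utils_op.cpp (the real DeepSpeed op_builder wires this up differently):

```python
# Minimal sketch of JIT-loading a C++ op the way these log lines suggest.
# "utils_op.cpp" is a hypothetical file, not part of the actual run.
import time
from torch.utils.cpp_extension import load

start = time.time()
utils = load(
    name="utils",              # build artifacts are cached under the extensions root
    sources=["utils_op.cpp"],  # hypothetical source; a cached build skips compilation
    verbose=True,              # emits the "Loading extension module ..." messages
)
print(f"Time to load utils op: {time.time() - start} seconds")
```

On a warm cache the loader detects no source changes, skips the build step, and the load takes only a couple of milliseconds, which matches the timings in the log.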
-Time to load utils op: 0.0013828277587890625 seconds
-Time to load utils op: 0.00135040283203125 seconds
-Time to load utils op: 0.0011870861053466797 seconds
-Time to load utils op: 0.0013580322265625 seconds
-[2021-10-27 21:23:06,371] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-27 21:23:06,372] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-27 21:23:06,372] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.27 GB, percent = 21.0%
-[2021-10-27 21:23:06,372] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-27 21:23:06,401] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-27 21:23:06,402] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-27 21:23:06,402] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 39.27 GB, percent = 21.0%
-[2021-10-27 21:23:06,402] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-27 21:23:06,402] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-27 21:23:06,402] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-27 21:23:06,402] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-27 21:23:06,402] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-27 21:23:06,402] [INFO] [config.py:944:print] activation_checkpointing_config {
-    "partition_activations": false,
-    "contiguous_memory_optimization": false,
-    "cpu_checkpointing": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": false
-}
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] flops_profiler_config ........ {
-    "enabled": false,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-}
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-27 21:23:06,403] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] zero_config .................. {
-    "stage": 1,
-    "contiguous_gradients": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5.000000e+08,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5.000000e+08,
-    "overlap_comm": false,
-    "load_from_fp32_weights": true,
-    "elastic_checkpoint": true,
-    "offload_param": null,
-    "offload_optimizer": null,
-    "sub_group_size": 1.000000e+09,
-    "prefetch_bucket_size": 5.000000e+07,
-    "param_persistence_threshold": 1.000000e+05,
-    "max_live_parameters": 1.000000e+09,
-    "max_reuse_distance": 1.000000e+09,
-    "gather_fp16_weights_on_model_save": false,
-    "ignore_unused_parameters": true,
-    "round_robin_gradients": false,
-    "legacy_stage1": false
-}
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-27 21:23:06,404] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-27 21:23:06,405] [INFO] [config.py:946:print] json = {
-    "train_micro_batch_size_per_gpu": 1,
-    "train_batch_size": 2.048000e+03,
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 1
-    },
-    "fp16": {
-        "enabled": true,
-        "loss_scale": 0,
-        "loss_scale_window": 500,
-        "hysteresis": 2,
-        "min_loss_scale": 1,
-        "initial_scale_power": 12
-    },
-    "curriculum_learning": {
-        "enabled": true,
-        "curriculum_type": "seqlen",
-        "min_difficulty": 64,
-        "max_difficulty": 2.048000e+03,
-        "schedule_type": "fixed_linear",
-        "schedule_config": {
-            "total_curriculum_step": 3.600000e+04,
-            "difficulty_step": 8
-        }
-    },
-    "steps_per_print": 2.000000e+03,
-    "wall_clock_breakdown": false
-}
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
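Two pieces of arithmetic are implied by the configuration dump above: the global batch is train_micro_batch_size_per_gpu x gradient_accumulation_steps x data-parallel size (1 x 2048 x 1 = 2048 sequences per optimizer step, with world_size 1 being the data-parallel degree), and the seqlen curriculum ramps the sequence length linearly from 64 to 2048 tokens over 36000 steps in multiples of 8. A sketch of that schedule, assuming the straightforward linear-plus-quantization reading of fixed_linear (DeepSpeed's own implementation may differ in rounding details):

```python
# Sketch of the curriculum and batch arithmetic implied by the config above.
# The constants mirror the logged values; seqlen_at() is illustrative, not DeepSpeed code.
MIN_DIFFICULTY, MAX_DIFFICULTY = 64, 2048          # seqlen curriculum bounds
TOTAL_CURRICULUM_STEP, DIFFICULTY_STEP = 36_000, 8

def seqlen_at(step: int) -> int:
    """Linear ramp from min to max difficulty, quantized to multiples of difficulty_step."""
    frac = min(step / TOTAL_CURRICULUM_STEP, 1.0)
    seqlen = MIN_DIFFICULTY + (MAX_DIFFICULTY - MIN_DIFFICULTY) * frac
    return max(MIN_DIFFICULTY, int(seqlen) // DIFFICULTY_STEP * DIFFICULTY_STEP)

# Global batch: micro batch per GPU x gradient accumulation x data-parallel size.
assert 1 * 2048 * 1 == 2048  # the remaining factor of 128 GPUs is tensor x pipeline parallelism

for step in (0, 9_000, 18_000, 36_000, 50_000):
    print(step, seqlen_at(step))   # 64, 560, 1056, 2048, 2048
```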
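The fp16 section describes dynamic loss scaling: loss_scale 0 means "dynamic", initial_scale_power 12 gives a starting scale of 2^12 = 4096, the scale doubles after a window of 500 overflow-free steps, hysteresis 2 tolerates a couple of overflows before backing off, and min_loss_scale 1 is the floor. A rough sketch of that policy (not DeepSpeed's exact code):

```python
# Illustrative dynamic loss scaler matching the logged parameters; details such as
# the exact hysteresis bookkeeping are assumptions, not DeepSpeed's implementation.
class DynamicLossScaler:
    def __init__(self, init_scale=4096, scale_window=500, min_scale=1, hysteresis=2):
        self.scale = init_scale
        self.scale_window = scale_window   # overflow-free steps before doubling
        self.min_scale = min_scale
        self.hysteresis = hysteresis       # tolerated consecutive overflows
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, overflow: bool) -> None:
        if overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                self.scale = max(self.scale / 2, self.min_scale)  # back off
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps % self.scale_window == 0:
                self.scale *= 2            # ramp back up after a stable window
```

Skipped steps under this scheme show up later in the log as the "skipped=" counter next to the step number.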
-Time to load utils op: 0.0008165836334228516 seconds
-[2021-10-27 21:23:06,406] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-27 21:23:06,798] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-Per-stage partitioning from [engine.py:151:__init__], one row per pipeline stage (4 ranks per stage; every rank reports TOTAL_PARAMS=104731203200 (104731.203M) and UNIQUE_PARAMS=104048195200 (104048.195M)):
-  STAGE  RANKS    LAYERS  LAYER RANGE  STAGE_PARAMS
-      0  0-3      5       [0, 5)       978291800 (978.292M)
-      1  4-7      2       [5, 7)       807539800 (807.540M)
-      2  8-11     2       [7, 9)       807539800 (807.540M)
-      3  12-15    2       [9, 11)      807539800 (807.540M)
-      4  16-19    2       [11, 13)     807539800 (807.540M)
-      5  20-23    2       [13, 15)     807539800 (807.540M)
-      6  24-27    2       [15, 17)     807539800 (807.540M)
-      7  28-31    2       [17, 19)     807539800 (807.540M)
-      8  32-35    2       [19, 21)     807539800 (807.540M)
-      9  36-39    2       [21, 23)     807539800 (807.540M)
-     10  40-43    2       [23, 25)     807539800 (807.540M)
-     11  44-47    2       [25, 27)     807539800 (807.540M)
-     12  48-51    2       [27, 29)     807539800 (807.540M)
-     13  52-55    2       [29, 31)     807539800 (807.540M)
-     14  56-59    2       [31, 33)     807539800 (807.540M)
-     15  60-63    2       [33, 35)     807539800 (807.540M)
-     16  64-67    2       [35, 37)     807539800 (807.540M)
-     17  68-71    2       [37, 39)     807539800 (807.540M)
-     18  72-75    2       [39, 41)     807539800 (807.540M)
-     19  76-79    2       [41, 43)     807539800 (807.540M)
-     20  80-83    2       [43, 45)     807539800 (807.540M)
-     21  84-87    2       [45, 47)     807539800 (807.540M)
-     22  88-91    2       [47, 49)     807539800 (807.540M)
-     23  92-95    2       [49, 51)     807539800 (807.540M)
-     24  96-99    2       [51, 53)     807539800 (807.540M)
-     25  100-103  2       [53, 55)     807539800 (807.540M)
-     26  104-107  2       [55, 57)     807539800 (807.540M)
-     27  108-111  2       [57, 59)     807539800 (807.540M)
-     28  112-115  2       [59, 61)     807539800 (807.540M)
-     29  116-119  2       [61, 63)     807539800 (807.540M)
-     30  120-123  2       [63, 65)     807539800 (807.540M)
-     31  124-127  6       [65, 71)     978315000 (978.315M)
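The table is consistent with the 128 GPUs being arranged as 32 pipeline stages of 4 ranks each, where the 4 ranks of a stage are presumably tensor-parallel shards of the same layers (hence TOTAL_PARAMS exceeding UNIQUE_PARAMS), and the first and last stages carrying extra layers such as the embedding and output head. A sketch that reproduces the stage-to-layer mapping; the embedding/head attribution is an assumption:

```python
# Reproduces the stage -> rank / layer-range mapping in the table above.
# Assumes 128 ranks = 32 pipeline stages x 4 ranks per stage.
N_STAGES, RANKS_PER_STAGE = 32, 4

def layer_range(stage: int) -> tuple[int, int]:
    if stage == 0:
        return (0, 5)             # 5 layers, presumably incl. the input embedding
    if stage == N_STAGES - 1:
        return (65, 71)           # 6 layers, presumably incl. final norm / head
    return (5 + 2 * (stage - 1), 5 + 2 * stage)  # 2 transformer layers per stage

for stage in range(N_STAGES):
    lo, hi = layer_range(stage)
    first_rank = stage * RANKS_PER_STAGE
    print(f"stage {stage:2d}: ranks {first_rank}-{first_rank + 3}, layers [{lo}, {hi})")
```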
- > using checkpoint value 0.0001 for learning rate
- > using checkpoint value 6e-06 for minimum learning rate
- > using checkpoint value 3750000 for warmup iterations
- > using checkpoint value 600000000 for total number of iterations
- > using checkpoint value cosine for decay style
-successfully loaded 1 ZeRO state_dicts for rank 122
-successfully loaded 1 ZeRO state_dicts for rank 117
-successfully loaded 1 ZeRO state_dicts for rank 124
-successfully loaded 1 ZeRO state_dicts for rank 68
-successfully loaded 1 ZeRO state_dicts for rank 11
-successfully loaded 1 ZeRO state_dicts for rank 127
-loading 1 zero partition checkpoints for rank 68
-loading 1 zero partition checkpoints for rank 122
-successfully loaded 1 ZeRO state_dicts for rank 0
-successfully loaded 1 ZeRO state_dicts for rank 2
-loading 1 zero partition checkpoints for rank 72
-loading 1 zero partition checkpoints for
rank 48 -loading 1 zero partition checkpoints for rank 61 -loading 1 zero partition checkpoints for rank 25 -loading 1 zero partition checkpoints for rank 19 -loading 1 zero partition checkpoints for rank 85 -successfully loaded 1 ZeRO state_dicts for rank 74 -loading 1 zero partition checkpoints for rank 56 -loading 1 zero partition checkpoints for rank 86 -loading 1 zero partition checkpoints for rank 29 -loading 1 zero partition checkpoints for rank 73 -loading 1 zero partition checkpoints for rank 75 -loading 1 zero partition checkpoints for rank 94 -loading 1 zero partition checkpoints for rank 36 -loading 1 zero partition checkpoints for rank 92 -loading 1 zero partition checkpoints for rank 95 -loading 1 zero partition checkpoints for rank 93 -loading 1 zero partition checkpoints for rank 3 -loading 1 zero partition checkpoints for rank 2 -loading 1 zero partition checkpoints for rank 39 -loading 1 zero partition checkpoints for rank 31 -loading 1 zero partition checkpoints for rank 28 -loading 1 zero partition checkpoints for rank 1 -loading 1 zero partition checkpoints for rank 74 -loading 1 zero partition checkpoints for rank 0 - checkpoint version 3.0 - successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 2028 -time (ms) | load-checkpoint: 13162.09 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several 
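For context, a hedged sketch of how a resume like the one above is typically driven through the DeepSpeed engine API. The function name, paths, and the `iteration` key in `client_state` are illustrative assumptions, not taken from this codebase:

```python
# Sketch only: assumes a torch model and a DeepSpeed config dict already exist.
import deepspeed

def resume(model, ds_config, load_dir="/path/to/checkpoints"):
    # deepspeed.initialize wraps the model in an engine that can restore both
    # the module weights and each rank's ZeRO optimizer partition.
    engine, _, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config=ds_config)
    # engine.load_checkpoint is the call behind per-rank messages like
    # "successfully loaded 1 ZeRO state_dicts" / "loading 1 zero partition
    # checkpoints" seen in this log.
    load_path, client_state = engine.load_checkpoint(load_dir)
    if load_path is None:
        raise RuntimeError(f"no checkpoint found under {load_dir}")
    # Megatron-style training loops stash the resume step in client_state
    # (key name assumed here).
    return engine, client_state.get("iteration", 0)
```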
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
-  warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-(warning repeated once per process; repeats omitted)
-estimated model parameters: 103.3650944
-estimated model parameters without embeddings: 103.3650944
-estimated model parameters: 125.2213504
-estimated model parameters: 125.22432
-(estimates repeated once per process; concurrent writes fused some repeats onto one line)
parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 125.2213504 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 125.2213504 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 125.22432 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model 
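The estimates above follow the usual transformer parameter arithmetic; the inflated "with embeddings" figures are the duplicated embedding copies on the first and last pipeline stages that the warning refers to. A minimal cross-check sketch, assuming hidden size 11600 and 64 layers (plausible for this 104B run but not stated in this excerpt) and an illustrative vocab size:

    # Rough GPT parameter count; every size here is an assumption chosen to
    # land near the ~103.365B "without embeddings" figure logged above.
    def estimate_gpt_params(hidden=11600, layers=64, vocab=50432, seq_len=2048):
        per_layer = 12 * hidden ** 2 + 13 * hidden  # QKV/attn-out/MLP weights + biases/norms
        body = layers * per_layer                   # transformer stack, embeddings excluded
        embeddings = (vocab + seq_len) * hidden     # token + position tables
        return body, embeddings

    body, emb = estimate_gpt_params()
    print(f"without embeddings: {body / 1e9:.2f}B")         # ~103.35B
    print(f"with embeddings:    {(body + emb) / 1e9:.2f}B")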
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-27 21:23:20
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 20008960
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
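The index-building messages above trace Megatron's mmap-backed indexed dataset being opened. A sketch of the general technique under illustrative assumptions (the file names and the exact .idx layout here are hypothetical, not Megatron's real on-disk format):

    import numpy as np

    # Hypothetical index: tokens-per-document sizes plus byte offsets into
    # one large binary token file.
    sizes = np.fromfile("corpus.idx", dtype=np.int32)
    pointers = np.zeros(len(sizes), dtype=np.int64)
    np.cumsum(sizes[:-1] * 2, out=pointers[1:])               # 2 bytes per uint16 token

    buf = np.memmap("corpus.bin", dtype=np.uint16, mode="r")  # "creating numpy buffer of mmap"
    view = memoryview(buf)                                    # "creating memory view of numpy buffer"

    def get_document(i):
        start = pointers[i] // 2
        return buf[start:start + sizes[i]]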
- > finished creating indexed dataset in 0.166940 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.359 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.292 seconds
-    total number of samples: 20781483
-    total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.069 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-27 21:23:26
-done with setup ...
-training ...
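The split boundaries above amount to a 94.9 / 5.0 / 0.1 document split, and each split's epoch count is the smallest number of passes that yields the requested sample count. A back-of-the-envelope check, with per-epoch sample counts back-derived from the "total number of samples" / "total number of epochs" pairs above:

    import math

    def epochs_needed(samples_requested, samples_per_epoch):
        # smallest whole number of passes over the split covering the request
        return math.ceil(samples_requested / samples_per_epoch)

    print(epochs_needed(600000000, 657686117 // 5))  # -> 5 (train)
    print(epochs_needed(20008960, 20781483 // 3))    # -> 3 (validation)
    print(epochs_needed(10240, 137384 // 1))         # -> 1 (test)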
-time (ms) | model-and-optimizer-setup: 19459.46 | train/valid/test-data-iterators-setup: 5633.55
-Number of parameters: 103.3650944 billion
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
-[before the start of training step] datetime: 2021-10-27 21:23:26
-[2021-10-27 21:23:26,581] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-27 21:23:26,582] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-27 21:23:26,582] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-27 21:23:26,582] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-27 21:23:26,582] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
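Those flags mirror DeepSpeed's activation-checkpointing options. A sketch of how the same settings are usually spelled in a ds_config (the keys are standard DeepSpeed config keys; this particular dict is a reconstruction, not the run's actual config file):

    # "activation_checkpointing" section matching the INFO lines above.
    ds_config_fragment = {
        "activation_checkpointing": {
            "partition_activations": False,            # Partition Activations False
            "cpu_checkpointing": False,                # CPU CHECKPOINTING False
            "contiguous_memory_optimization": False,   # contiguous Memory Checkpointing False
            "number_checkpoints": 64,                  # "with 64 total layers"
            "synchronize_checkpoint_boundary": False,  # Synchronization False
            "profile": False,                          # Profiling time in checkpointing False
        }
    }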
-[Rank 0] (after 2029 iterations) memory (MB) | allocated: 13205.5849609375 | max allocated: 20669.1337890625 | reserved: 24438.0 | max reserved: 24438.0
-[Rank 1] (after 2029 iterations) memory (MB) | allocated: 13201.9970703125 | max allocated: 20665.5458984375 | reserved: 24438.0 | max reserved: 24438.0
-[Rank 2] (after 2029 iterations) memory (MB) | allocated: 13204.94287109375 | max allocated: 20668.49169921875 | reserved: 24438.0 | max reserved: 24438.0
-[Rank 3] (after 2029 iterations) memory (MB) | allocated: 13201.998046875 | max allocated: 20665.546875 | reserved: 24438.0 | max reserved: 24438.0
-[Rank 4] (after 2029 iterations) memory (MB) | allocated: 10788.26513671875 | max allocated: 16948.44677734375 | reserved: 20106.0 | max reserved: 20106.0
-[Rank 123] (after 2029 iterations) memory (MB) | allocated: 10788.26513671875 | max allocated: 16948.44677734375 | reserved: 20078.0 | max reserved: 20078.0
-[Rank 124] (after 2029 iterations) memory (MB) | allocated: 13102.48974609375 | max allocated: 20566.095703125 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 125] (after 2029 iterations) memory (MB) | allocated: 13102.6513671875 | max allocated: 20566.25732421875 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 126] (after 2029 iterations) memory (MB) | allocated: 13102.6513671875 | max allocated: 20566.25732421875 | reserved: 24406.0 | max reserved: 24406.0
-[Rank 127] (after 2029 iterations) memory (MB) | allocated: 13101.80322265625 | max allocated: 20565.4091796875 | reserved: 24406.0 | max reserved: 24406.0
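Ranks 5-122 (not shown) reported the same allocated and max-allocated figures as rank 4, with reserved memory between 20078 MB and 20106 MB; only the first-stage (0-3), representative middle (4, 123), and last-stage (124-127) ranks differ. The report itself is standard torch.cuda allocator accounting; a minimal sketch (the helper name is hypothetical, the four torch.cuda calls are real):

    import torch
    import torch.distributed as dist

    def report_memory(iteration):
        # Reproduces the "[Rank N] (after I iterations) memory (MB) | ..." format.
        mb = 1024 * 1024
        print(f"[Rank {dist.get_rank()}] (after {iteration} iterations) memory (MB)"
              f" | allocated: {torch.cuda.memory_allocated() / mb}"
              f" | max allocated: {torch.cuda.max_memory_allocated() / mb}"
              f" | reserved: {torch.cuda.memory_reserved() / mb}"
              f" | max reserved: {torch.cuda.max_memory_reserved() / mb}")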
- iteration     2029/  292968 | consumed samples: 4155392 | consumed tokens: 481787904 | elapsed time per iteration (ms): 198706.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.921306E+00 | loss scale: 65536.0 | grad norm: 22809.531 | num zeros: 0.0 | curriculum seqlen: 168 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2030/  292968 | consumed samples: 4157440 | consumed tokens: 482131968 | elapsed time per iteration (ms): 130900.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.927055E+00 | loss scale: 65536.0 | grad norm: 48566.062 | num zeros: 0.0 | curriculum seqlen: 168 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2031/  292968 | consumed samples: 4159488 | consumed tokens: 482476032 | elapsed time per iteration (ms): 130065.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.058623E+00 | loss scale: 65536.0 | grad norm: 123918.858 | num zeros: 0.0 | curriculum seqlen: 168 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2032/  292968 | consumed samples: 4161536 | consumed tokens: 482820096 | elapsed time per iteration (ms): 129977.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.988751E+00 | loss scale: 65536.0 | grad norm: 57688.318 | num zeros: 0.0 | curriculum seqlen: 168 | number of skipped iterations: 0 | number of nan iterations: 0 |
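Note how the token counter tracks the curriculum: each iteration consumes global batch size × curriculum seqlen tokens, i.e. 2048 × 168 = 344064 per step here, and 2048 × 176 = 360448 once the curriculum steps up to 176 at iteration 2033 below. A quick sketch of that bookkeeping (the state variables are illustrative, not Megatron's actual names):

    # Replays the "consumed samples/tokens" columns from iteration 2029 onward.
    consumed_samples, consumed_tokens = 4155392, 481787904  # state after iteration 2029
    global_batch_size = 2048

    for seqlen in (168, 168, 168, 176):  # curriculum seqlen, iterations 2030-2033
        consumed_samples += global_batch_size
        consumed_tokens += global_batch_size * seqlen

    print(consumed_samples, consumed_tokens)  # 4163584 483180544 -> matches iteration 2033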
- iteration     2033/  292968 | consumed samples: 4163584 | consumed tokens: 483180544 | elapsed time per iteration (ms): 132877.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.997915E+00 | loss scale: 65536.0 | grad norm: 79731.678 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2034/  292968 | consumed samples: 4165632 | consumed tokens: 483540992 | elapsed time per iteration (ms): 131380.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 4.020886E+00 | loss scale: 65536.0 | grad norm: 76036.022 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2035/  292968 | consumed samples: 4167680 | consumed tokens: 483901440 | elapsed time per iteration (ms): 134755.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.964027E+00 | loss scale: 65536.0 | grad norm: 44573.372 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2036/  292968 | consumed samples: 4169728 | consumed tokens: 484261888 | elapsed time per iteration (ms): 133667.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.965054E+00 | loss scale: 65536.0 | grad norm: 51213.026 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2037/  292968 | consumed samples: 4171776 | consumed tokens: 484622336 | elapsed time per iteration (ms): 133634.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.952868E+00 | loss scale: 65536.0 | grad norm: 45561.081 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2038/  292968 | consumed samples: 4173824 | consumed tokens: 484982784 | elapsed time per iteration (ms): 131738.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.937924E+00 | loss scale: 65536.0 | grad norm: 43953.798 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2039/  292968 | consumed samples: 4175872 | consumed tokens: 485343232 | elapsed time per iteration (ms): 134271.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.923034E+00 | loss scale: 65536.0 | grad norm: 30461.802 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2040/  292968 | consumed samples: 4177920 | consumed tokens: 485703680 | elapsed time per iteration (ms): 131586.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.932657E+00 | loss scale: 65536.0 | grad norm: 32836.484 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2041/  292968 | consumed samples: 4179968 | consumed tokens: 486064128 | elapsed time per iteration (ms): 135036.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.944592E+00 | loss scale: 65536.0 | grad norm: 32094.800 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2042/  292968 | consumed samples: 4182016 | consumed tokens: 486424576 | elapsed time per iteration (ms): 130497.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.944293E+00 | loss scale: 65536.0 | grad norm: 27356.464 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2043/  292968 | consumed samples: 4184064 | consumed tokens: 486785024 | elapsed time per iteration (ms): 132568.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.912325E+00 | loss scale: 65536.0 | grad norm: 23051.343 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2044/  292968 | consumed samples: 4186112 | consumed tokens: 487145472 | elapsed time per iteration (ms): 137271.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.916307E+00 | loss scale: 65536.0 | grad norm: 23092.481 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2045/  292968 | consumed samples: 4188160 | consumed tokens: 487505920 | elapsed time per iteration (ms): 133569.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.908511E+00 | loss scale: 65536.0 | grad norm: 29297.190 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2046/  292968 | consumed samples: 4190208 | consumed tokens: 487866368 | elapsed time per iteration (ms): 130095.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.894101E+00 | loss scale: 65536.0 | grad norm: 33713.892 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2047/  292968 | consumed samples: 4192256 | consumed tokens: 488226816 | elapsed time per iteration (ms): 131789.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.895768E+00 | loss scale: 65536.0 | grad norm: 26562.238 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2048/  292968 | consumed samples: 4194304 | consumed tokens: 488587264 | elapsed time per iteration (ms): 134058.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.899483E+00 | loss scale: 65536.0 | grad norm: 22176.878 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2049/  292968 | consumed samples: 4196352 | consumed tokens: 488947712 | elapsed time per iteration (ms): 131546.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.878782E+00 | loss scale: 65536.0 | grad norm: 26466.668 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2050/  292968 | consumed samples: 4198400 | consumed tokens: 489308160 | elapsed time per iteration (ms): 135705.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.926190E+00 | loss scale: 65536.0 | grad norm: 23320.397 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2051/  292968 | consumed samples: 4200448 | consumed tokens: 489668608 | elapsed time per iteration (ms): 132629.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.887108E+00 | loss scale: 65536.0 | grad norm: 20830.207 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2052/  292968 | consumed samples: 4202496 | consumed tokens: 490029056 | elapsed time per iteration (ms): 140941.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.878243E+00 | loss scale: 65536.0 | grad norm: 22245.940 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2053/  292968 | consumed samples: 4204544 | consumed tokens: 490389504 | elapsed time per iteration (ms): 140434.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.888543E+00 | loss scale: 65536.0 | grad norm: 22397.036 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2054/  292968 | consumed samples: 4206592 | consumed tokens: 490749952 | elapsed time per iteration (ms): 141655.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.902360E+00 | loss scale: 65536.0 | grad norm: 24845.846 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2055/  292968 | consumed samples: 4208640 | consumed tokens: 491110400 | elapsed time per iteration (ms): 142359.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.908053E+00 | loss scale: 65536.0 | grad norm: 27892.765 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2056/  292968 | consumed samples: 4210688 | consumed tokens: 491470848 | elapsed time per iteration (ms): 151461.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.902268E+00 | loss scale: 65536.0 | grad norm: 30664.749 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2057/  292968 | consumed samples: 4212736 | consumed tokens: 491831296 | elapsed time per iteration (ms): 149716.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.890077E+00 | loss scale: 65536.0 | grad norm: 28090.841 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2058/  292968 | consumed samples: 4214784 | consumed tokens: 492191744 | elapsed time per iteration (ms): 148912.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.864818E+00 | loss scale: 65536.0 | grad norm: 24501.017 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2059/  292968 | consumed samples: 4216832 | consumed tokens: 492552192 | elapsed time per iteration (ms): 145308.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.885553E+00 | loss scale: 65536.0 | grad norm: 31145.909 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2060/  292968 | consumed samples: 4218880 | consumed tokens: 492912640 | elapsed time per iteration (ms): 142672.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.877625E+00 | loss scale: 65536.0 | grad norm: 27117.576 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2061/  292968 | consumed samples: 4220928 | consumed tokens: 493273088 | elapsed time per iteration (ms): 143820.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.861409E+00 | loss scale: 65536.0 | grad norm: 24865.601 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2062/  292968 | consumed samples: 4222976 | consumed tokens: 493633536 | elapsed time per iteration (ms): 137880.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.909456E+00 | loss scale: 65536.0 | grad norm: 34182.522 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2063/  292968 | consumed samples: 4225024 | consumed tokens: 493993984 | elapsed time per iteration (ms): 132029.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.885704E+00 | loss scale: 65536.0 | grad norm: 40220.119 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2064/  292968 | consumed samples: 4227072 | consumed tokens: 494354432 | elapsed time per iteration (ms): 131610.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.894105E+00 | loss scale: 65536.0 | grad norm: 37708.438 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2065/  292968 | consumed samples: 4229120 | consumed tokens: 494714880 | elapsed time per iteration (ms): 137997.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.866132E+00 | loss scale: 65536.0 | grad norm: 31553.103 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2066/  292968 | consumed samples: 4231168 | consumed tokens: 495075328 | elapsed time per iteration (ms): 130687.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.881151E+00 | loss scale: 65536.0 | grad norm: 23625.790 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2067/  292968 | consumed samples: 4233216 | consumed tokens: 495435776 | elapsed time per iteration (ms): 132648.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.869434E+00 | loss scale: 65536.0 | grad norm: 41905.990 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2068/  292968 | consumed samples: 4235264 | consumed tokens: 495796224 | elapsed time per iteration (ms): 134152.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.894929E+00 | loss scale: 65536.0 | grad norm: 40728.965 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2069/  292968 | consumed samples: 4237312 | consumed tokens: 496156672 | elapsed time per iteration (ms): 132130.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.868305E+00 | loss scale: 65536.0 | grad norm: 33062.655 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2070/  292968 | consumed samples: 4239360 | consumed tokens: 496517120 | elapsed time per iteration (ms): 133140.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.864267E+00 | loss scale: 65536.0 | grad norm: 25428.330 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2071/  292968 | consumed samples: 4241408 | consumed tokens: 496877568 | elapsed time per iteration (ms): 131646.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.850497E+00 | loss scale: 65536.0 | grad norm: 24787.928 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2072/  292968 | consumed samples: 4243456 | consumed tokens: 497238016 | elapsed time per iteration (ms): 132756.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.871679E+00 | loss scale: 65536.0 | grad norm: 24584.462 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2073/  292968 | consumed samples: 4245504 | consumed tokens: 497598464 | elapsed time per iteration (ms): 134067.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.888788E+00 | loss scale: 65536.0 | grad norm: 23852.558 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2074/  292968 | consumed samples: 4247552 | consumed tokens: 497958912 | elapsed time per iteration (ms): 137221.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.883694E+00 | loss scale: 65536.0 | grad norm: 26582.790 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2075/  292968 | consumed samples: 4249600 | consumed tokens: 498319360 | elapsed time per iteration (ms): 133079.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.863066E+00 | loss scale: 65536.0 | grad norm: 32015.213 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2076/  292968 | consumed samples: 4251648 | consumed tokens: 498679808 | elapsed time per iteration (ms): 131793.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.852857E+00 | loss scale: 65536.0 | grad norm: 29519.730 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2077/  292968 | consumed samples: 4253696 | consumed tokens: 499040256 | elapsed time per iteration (ms): 133934.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.903611E+00 | loss scale: 65536.0 | grad norm: 24420.681 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2078/  292968 | consumed samples: 4255744 | consumed tokens: 499400704 | elapsed time per iteration (ms): 134178.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.850241E+00 | loss scale: 65536.0 | grad norm: 25387.677 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2079/  292968 | consumed samples: 4257792 | consumed tokens: 499761152 | elapsed time per iteration (ms): 140907.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.893193E+00 | loss scale: 65536.0 | grad norm: 29971.118 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
- iteration     2080/  292968 | consumed samples: 4259840 | consumed tokens: 500121600 | elapsed time per iteration (ms): 134883.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.878091E+00 | loss scale: 65536.0 | grad norm: 39384.696 | num zeros: 0.0 | curriculum seqlen: 176 | number of
skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2081/ 292968 | consumed samples: 4261888 | consumed tokens: 500482048 | elapsed time per iteration (ms): 130241.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.891078E+00 | loss scale: 65536.0 | grad norm: 35904.007 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2082/ 292968 | consumed samples: 4263936 | consumed tokens: 500842496 | elapsed time per iteration (ms): 131632.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.890465E+00 | loss scale: 65536.0 | grad norm: 26506.405 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2083/ 292968 | consumed samples: 4265984 | consumed tokens: 501202944 | elapsed time per iteration (ms): 134951.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.858867E+00 | loss scale: 65536.0 | grad norm: 22245.940 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2084/ 292968 | consumed samples: 4268032 | consumed tokens: 501563392 | elapsed time per iteration (ms): 132351.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.863218E+00 | loss scale: 65536.0 | grad norm: 27540.231 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2085/ 292968 | consumed samples: 4270080 | consumed tokens: 501923840 | elapsed time per iteration (ms): 133455.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.882363E+00 | loss scale: 65536.0 | grad norm: 32337.043 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2086/ 292968 | consumed samples: 4272128 | consumed tokens: 502284288 | elapsed time per iteration (ms): 133071.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.896693E+00 | loss scale: 65536.0 | grad norm: 32698.610 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2087/ 292968 | consumed samples: 4274176 | consumed tokens: 502644736 | elapsed time per iteration (ms): 134306.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.873858E+00 | loss scale: 65536.0 | grad norm: 33569.747 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2088/ 292968 | consumed samples: 4276224 | consumed tokens: 503005184 | elapsed time per iteration (ms): 137518.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.858442E+00 | loss scale: 65536.0 | grad norm: 24931.248 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2089/ 292968 | consumed samples: 4278272 | consumed tokens: 503365632 | elapsed time per iteration (ms): 141082.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.892706E+00 | loss scale: 65536.0 | grad norm: 25398.454 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2090/ 292968 | consumed samples: 4280320 | consumed tokens: 503726080 | elapsed time per iteration (ms): 133471.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm 
loss: 3.864743E+00 | loss scale: 65536.0 | grad norm: 21118.970 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2091/ 292968 | consumed samples: 4282368 | consumed tokens: 504086528 | elapsed time per iteration (ms): 134837.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.873662E+00 | loss scale: 65536.0 | grad norm: 29127.333 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2092/ 292968 | consumed samples: 4284416 | consumed tokens: 504446976 | elapsed time per iteration (ms): 133886.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.873102E+00 | loss scale: 65536.0 | grad norm: 28175.124 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2093/ 292968 | consumed samples: 4286464 | consumed tokens: 504807424 | elapsed time per iteration (ms): 132930.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.833543E+00 | loss scale: 65536.0 | grad norm: 18462.570 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2094/ 292968 | consumed samples: 4288512 | consumed tokens: 505167872 | elapsed time per iteration (ms): 132302.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.877876E+00 | loss scale: 65536.0 | grad norm: 33180.068 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2095/ 292968 | consumed samples: 4290560 | consumed tokens: 505528320 | elapsed time per iteration (ms): 132638.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.877420E+00 | loss scale: 65536.0 | grad norm: 40070.923 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2096/ 292968 | consumed samples: 4292608 | consumed tokens: 505888768 | elapsed time per iteration (ms): 130458.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.886416E+00 | loss scale: 65536.0 | grad norm: 32376.813 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2097/ 292968 | consumed samples: 4294656 | consumed tokens: 506249216 | elapsed time per iteration (ms): 133389.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.856274E+00 | loss scale: 65536.0 | grad norm: 27017.836 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2098/ 292968 | consumed samples: 4296704 | consumed tokens: 506609664 | elapsed time per iteration (ms): 134060.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.872075E+00 | loss scale: 65536.0 | grad norm: 24254.370 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2099/ 292968 | consumed samples: 4298752 | consumed tokens: 506970112 | elapsed time per iteration (ms): 134222.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.888329E+00 | loss scale: 65536.0 | grad norm: 24591.903 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2100/ 292968 | consumed samples: 4300800 | consumed 
tokens: 507330560 | elapsed time per iteration (ms): 133181.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.839217E+00 | loss scale: 65536.0 | grad norm: 27113.272 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 2100 | lm loss value: 3.873244E+00 | lm loss PPL: 4.809815E+01 |
-------------------------------------------------------------------------------------------------
-saving checkpoint at iteration 2100 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-28 00:09:50,103] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2100/mp_rank_00_model_states.pt
-[2021-10-28 00:09:50,116] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2100/mp_rank_01_model_states.pt
-[2021-10-28 00:10:03,867] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2100/zero_pp_rank_0_mp_rank_21_optim_states.pt
[... further near-identical "zero checkpoint saved" confirmations for the remaining zero_pp_rank_0_mp_rank_* optimizer shards (mp_rank_00 through mp_rank_127, 2021-10-28 00:10:03 to 00:10:16) omitted ...]
-[2021-10-28 00:10:16,225] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2100/zero_pp_rank_0_mp_rank_72_optim_states.pt
- successfully saved checkpoint at iteration 2100 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 28923.13
- iteration 2101/ 292968 | consumed samples: 4302848 | consumed tokens: 507691008 | elapsed time per iteration (ms): 358985.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.870862E+00 | loss scale: 65536.0 | grad norm: 29078.140 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2102/ 292968 | consumed samples: 4304896 | consumed tokens: 508051456 | elapsed time per iteration (ms): 134389.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.844451E+00 | loss scale: 65536.0 | grad norm: 35902.114 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2103/ 292968 | consumed samples: 4306944 | consumed tokens: 508411904 | elapsed time per iteration (ms): 133540.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.872329E+00 | loss scale: 65536.0 | grad norm: 34980.542 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2104/ 292968 | consumed samples: 4308992 | consumed tokens: 508772352 | elapsed time per iteration (ms): 130757.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.860693E+00 | loss scale: 65536.0 | grad norm: 25843.822 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2105/ 292968 | consumed samples: 4311040 | consumed tokens: 509132800 | elapsed time per iteration (ms): 132969.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.867200E+00 | loss scale: 65536.0 | grad norm: 32942.334 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2106/ 292968 | consumed samples: 4313088 | consumed tokens: 509493248 | elapsed time per iteration (ms): 139814.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.858380E+00 | loss scale: 65536.0 | grad norm: 42709.783 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2107/ 292968 | consumed samples: 4315136 | consumed tokens: 509853696 | elapsed time per iteration (ms): 133362.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.871891E+00 | loss
scale: 65536.0 | grad norm: 48341.034 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2108/ 292968 | consumed samples: 4317184 | consumed tokens: 510214144 | elapsed time per iteration (ms): 133571.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.853979E+00 | loss scale: 65536.0 | grad norm: 36136.352 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2109/ 292968 | consumed samples: 4319232 | consumed tokens: 510574592 | elapsed time per iteration (ms): 134023.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.876661E+00 | loss scale: 65536.0 | grad norm: 23163.390 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2110/ 292968 | consumed samples: 4321280 | consumed tokens: 510935040 | elapsed time per iteration (ms): 135355.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.860840E+00 | loss scale: 65536.0 | grad norm: 24292.325 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2111/ 292968 | consumed samples: 4323328 | consumed tokens: 511295488 | elapsed time per iteration (ms): 134585.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.839595E+00 | loss scale: 65536.0 | grad norm: 27745.848 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2112/ 292968 | consumed samples: 4325376 | consumed tokens: 511655936 | elapsed time per iteration (ms): 132842.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.889898E+00 | loss scale: 65536.0 | grad norm: 32478.720 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2113/ 292968 | consumed samples: 4327424 | consumed tokens: 512016384 | elapsed time per iteration (ms): 134178.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.839876E+00 | loss scale: 65536.0 | grad norm: 34245.116 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2114/ 292968 | consumed samples: 4329472 | consumed tokens: 512376832 | elapsed time per iteration (ms): 134702.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.875976E+00 | loss scale: 65536.0 | grad norm: 24813.391 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2115/ 292968 | consumed samples: 4331520 | consumed tokens: 512737280 | elapsed time per iteration (ms): 136212.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.845379E+00 | loss scale: 65536.0 | grad norm: 20692.466 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2116/ 292968 | consumed samples: 4333568 | consumed tokens: 513097728 | elapsed time per iteration (ms): 131581.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.882514E+00 | loss scale: 65536.0 | grad norm: 31939.759 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2117/ 292968 | consumed samples: 4335616 | consumed tokens: 513458176 | elapsed 
time per iteration (ms): 133394.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.848691E+00 | loss scale: 65536.0 | grad norm: 35220.406 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2118/ 292968 | consumed samples: 4337664 | consumed tokens: 513818624 | elapsed time per iteration (ms): 134848.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.848073E+00 | loss scale: 65536.0 | grad norm: 24731.570 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2119/ 292968 | consumed samples: 4339712 | consumed tokens: 514179072 | elapsed time per iteration (ms): 133987.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.865925E+00 | loss scale: 65536.0 | grad norm: 44365.520 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2120/ 292968 | consumed samples: 4341760 | consumed tokens: 514539520 | elapsed time per iteration (ms): 142746.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.858386E+00 | loss scale: 65536.0 | grad norm: 38305.779 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2121/ 292968 | consumed samples: 4343808 | consumed tokens: 514899968 | elapsed time per iteration (ms): 143474.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.850712E+00 | loss scale: 65536.0 | grad norm: 29301.206 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2122/ 292968 | consumed samples: 4345856 | consumed tokens: 515260416 | elapsed time per iteration (ms): 138269.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.861865E+00 | loss scale: 65536.0 | grad norm: 32206.798 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2123/ 292968 | consumed samples: 4347904 | consumed tokens: 515620864 | elapsed time per iteration (ms): 141414.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.848934E+00 | loss scale: 65536.0 | grad norm: 21453.635 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2124/ 292968 | consumed samples: 4349952 | consumed tokens: 515981312 | elapsed time per iteration (ms): 142183.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.847641E+00 | loss scale: 65536.0 | grad norm: 22122.044 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2125/ 292968 | consumed samples: 4352000 | consumed tokens: 516341760 | elapsed time per iteration (ms): 139050.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.865095E+00 | loss scale: 65536.0 | grad norm: 22923.420 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2126/ 292968 | consumed samples: 4354048 | consumed tokens: 516702208 | elapsed time per iteration (ms): 140222.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.852949E+00 | loss scale: 65536.0 | grad norm: 26226.729 | num zeros: 0.0 | curriculum seqlen: 176 | number of skipped iterations: 0 | number of nan 
iterations: 0 |
time (ms)

 iterations 2127-2329 | fields constant in every complete record: learning rate: 1.000E-04 | global batch size: 2048 | loss scale: 65536.0 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0

 iter | consumed samples | consumed tokens | elapsed/iter (ms) | lm loss | grad norm | curriculum seqlen
 2127 | 4356096 | 517062656 | 140924.5 | 3.833083 | 22691.858 | 176
 2128 | 4358144 | 517423104 | 144722.5 | 3.856811 | 23113.896 | 176
 2129 | 4360192 | 517783552 | 137440.6 | 3.861891 | 21165.213 | 176
 2130 | 4362240 | 518144000 | 136791.0 | 3.877660 | 19740.090 | 176
 2131 | 4364288 | 518504448 | 131099.7 | 3.832720 | 31725.251 | 176
 2132 | 4366336 | 518864896 | 134096.7 | 3.862016 | 34981.487 | 176
 2133 | 4368384 | 519225344 | 139887.6 | 3.851338 | 30123.946 | 176
 2134 | 4370432 | 519585792 | 138758.4 | 3.834562 | 33040.573 | 176
 2135 | 4372480 | 519946240 | 137894.2 | 3.860944 | 36103.926 | 176
 2136 | 4374528 | 520306688 | 138798.1 | 3.869861 | 34068.053 | 176
 2137 | 4376576 | 520667136 | 142383.9 | 3.842812 | 30884.030 | 176
 2138 | 4378624 | 521027584 | 144235.2 | 3.858552 | 29690.732 | 176
 2139 | 4380672 | 521388032 | 141256.2 | 3.904191 | 35148.789 | 176
 2140 | 4382720 | 521748480 | 139215.6 | 3.849405 | 40926.532 | 176
 2141 | 4384768 | 522108928 | 140561.9 | 3.839933 | 35261.023 | 176
 2142 | 4386816 | 522469376 | 139176.5 | 3.842124 | 30671.644 | 176
 2143 | 4388864 | 522829824 | 139588.8 | 3.839497 | 25697.017 | 176
 2144 | 4390912 | 523190272 | 139619.3 | 3.851575 | 29105.904 | 176
 2145 | 4392960 | 523550720 | 136866.1 | 3.805974 | 30213.426 | 176
 2146 | 4395008 | 523911168 | 141796.1 | 3.843766 | 28281.623 | 176
 2147 | 4397056 | 524271616 | 135906.0 | 3.848487 | 40387.921 | 176
 2148 | 4399104 | 524632064 | 134018.2 | 3.862791 | 36912.796 | 176
 2149 | 4401152 | 524992512 | 135336.2 | 3.846223 | 27493.518 | 176
 2150 | 4403200 | 525352960 | 139401.5 | 3.867390 | 31139.624 | 176
 2151 | 4405248 | 525713408 | 139246.1 | 3.866351 | 34314.485 | 176
 2152 | 4407296 | 526073856 | 134292.2 | 3.864145 | 37711.446 | 176
 2153 | 4409344 | 526434304 | 138463.4 | 3.884693 | 35093.695 | 176
 2154 | 4411392 | 526794752 | 143182.4 | 3.873112 | 34253.073 | 176
 2155 | 4413440 | 527155200 | 139414.7 | 3.848808 | 34935.282 | 176
 2156 | 4415488 | 527515648 | 135174.5 | 3.836921 | 28431.727 | 176
 2157 | 4417536 | 527876096 | 134807.7 | 3.858356 | 37307.070 | 176
 2158 | 4419584 | 528236544 | 137250.4 | 3.862726 | 28114.718 | 176
 2159 | 4421632 | 528596992 | 135913.6 | 3.837076 | 28287.019 | 176
 2160 | 4423680 | 528957440 | 137999.4 | 3.846355 | 28401.556 | 176
 2161 | 4425728 | 529317888 | 139138.6 | 3.874059 | 29920.076 | 176
 2162 | 4427776 | 529678336 | 136927.8 | 3.825433 | 30484.668 | 176
 2163 | 4429824 | 530038784 | 139419.0 | 3.851492 | 26570.843 | 176
 2164 | 4431872 | 530399232 | 139419.3 | 3.857165 | 25158.638 | 176
 2165 | 4433920 | 530759680 | 134243.0 | 3.828908 | 34657.058 | 176
 2166 | 4435968 | 531120128 | 136790.6 | 3.847366 | 28687.983 | 176
 2167 | 4438016 | 531480576 | 137090.4 | 3.844301 | 35855.397 | 176
 2168 | 4440064 | 531841024 | 138048.2 | 3.849959 | 34380.620 | 176
 2169 | 4442112 | 532201472 | 139543.2 | 3.822693 | 28190.794 | 176
 2170 | 4444160 | 532561920 | 137820.7 | 3.842875 | 24235.953 | 176
 2171 | 4446208 | 532922368 | 134480.4 | 3.825045 | 22239.272 | 176
 2172 | 4448256 | 533282816 | 138952.7 | 3.851873 | 21024.807 | 176
 2173 | 4450304 | 533643264 | 141985.9 | 3.833343 | 28346.520 | 176
 2174 | 4452352 | 534003712 | 138424.8 | 3.826918 | 32804.300 | 176
 2175 | 4454400 | 534364160 | 141850.8 | 3.833400 | 25035.656 | 176
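 A consistency check one can run on the rows above: each iteration consumes global batch size x curriculum seqlen tokens, so at seqlen 176 the consumed-tokens column should advance by 2048 x 176 = 360448 per step, and by 2048 x 184 = 376832 once seqlen rises to 184 at iteration 2178 (see the next table). A minimal Python sketch, with the row tuples copied from these tables; the variable names are mine, not from the training code:

    # Token accounting: consumed_tokens should advance by
    # global_batch_size * curriculum_seqlen each iteration.
    GLOBAL_BATCH_SIZE = 2048  # constant in this log segment

    # (iteration, consumed_tokens, curriculum_seqlen) copied from the log
    ROWS = [
        (2127, 517062656, 176),
        (2128, 517423104, 176),
        (2129, 517783552, 176),
        (2177, 535085056, 176),
        (2178, 535461888, 184),  # first step after seqlen rose to 184
    ]

    for (it_a, tok_a, _), (it_b, tok_b, seq_b) in zip(ROWS, ROWS[1:]):
        if it_b != it_a + 1:
            continue  # only compare consecutive iterations
        delta = tok_b - tok_a
        assert delta == GLOBAL_BATCH_SIZE * seq_b, (it_b, delta)
    print("consumed-token deltas match batch_size * seqlen")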
 iter | consumed samples | consumed tokens | elapsed/iter (ms) | lm loss | grad norm | curriculum seqlen
 2176 | 4456448 | 534724608 | 139037.6 | 3.829341 | 21140.652 | 176
 2177 | 4458496 | 535085056 | 134395.5 | 3.843111 | 25612.766 | 176
 2178 | 4460544 | 535461888 | 138364.0 | 3.841423 | 33509.596 | 184
 2179 | 4462592 | 535838720 | 145633.9 | 3.843890 | 36153.635 | 184
 2180 | 4464640 | 536215552 | 138884.3 | 3.848327 | 37309.403 | 184
 2181 | 4466688 | 536592384 | 140272.1 | 3.833440 | 30115.284 | 184
 2182 | 4468736 | 536969216 | 136000.6 | 3.872945 | 22487.474 | 184
 2183 | 4470784 | 537346048 | 136832.0 | 3.824508 | 32067.001 | 184
 2184 | 4472832 | 537722880 | 134931.6 | 3.819839 | 40516.028 | 184
 2185 | 4474880 | 538099712 | 133860.6 | 3.851592 | 45162.716 | 184
 2186 | 4476928 | 538476544 | 133313.3 | 3.839691 | 31727.848 | 184
 2187 | 4478976 | 538853376 | 140967.1 | 3.831524 | 30595.515 | 184
 2188 | 4481024 | 539230208 | 138095.4 | 3.820030 | 36570.081 | 184
 2189 | 4483072 | 539607040 | 135450.9 | 3.828967 | 36690.180 | 184
 2190 | 4485120 | 539983872 | 137339.4 | 3.836876 | 37025.685 | 184
 2191 | 4487168 | 540360704 | 136415.9 | 3.837189 | 25872.051 | 184
 2192 | 4489216 | 540737536 | 136278.3 | 3.827043 | 20014.426 | 184
 2193 | 4491264 | 541114368 | 135851.8 | 3.843652 | 18372.606 | 184
 2194 | 4493312 | 541491200 | 137966.2 | 3.831741 | 22932.245 | 184
 2195 | 4495360 | 541868032 | 136062.0 | 3.826357 | 21499.185 | 184
 2196 | 4497408 | 542244864 | 140471.5 | 3.842843 | 21394.868 | 184
 2197 | 4499456 | 542621696 | 136502.8 | 3.818826 | 22258.567 | 184
 2198 | 4501504 | 542998528 | 141539.6 | 3.795285 | 23089.525 | 184
 2199 | 4503552 | 543375360 | 136384.2 | 3.837844 | 24989.849 | 184
 2200 | 4505600 | 543752192 | 143850.9 | 3.832821 | 31261.280 | 184
 2201 | 4507648 | 544129024 | 134570.3 | 3.837898 | 34445.754 | 184
 2202 | 4509696 | 544505856 | 135920.7 | 3.807807 | 30823.658 | 184
 2203 | 4511744 | 544882688 | 135931.1 | 3.831536 | 21538.582 | 184
 2204 | 4513792 | 545259520 | 142476.9 | 3.799485 | 21519.075 | 184
 2205 | 4515840 | 545636352 | 136774.1 | 3.845019 | 28274.625 | 184
 2206 | 4517888 | 546013184 | 137682.6 | 3.791939 | 28644.422 | 184
 2207 | 4519936 | 546390016 | 143158.5 | 3.815455 | 24295.779 | 184
 2208 | 4521984 | 546766848 | 136986.8 | 3.826050 | 18847.154 | 184
 2209 | 4524032 | 547143680 | 140054.9 | 3.829313 | 25345.133 | 184
 2210 | 4526080 | 547520512 | 137577.9 | 3.839798 | 32603.320 | 184
 2211 | 4528128 | 547897344 | 143992.7 | 3.843069 | 35578.434 | 184
 2212 | 4530176 | 548274176 | 144104.5 | 3.840665 | 25356.813 | 184
 2213 | 4532224 | 548651008 | 139900.9 | 3.842168 | 33859.413 | 184
 2214 | 4534272 | 549027840 | 138699.4 | 3.846391 | 52266.075 | 184
 2215 | 4536320 | 549404672 | 135259.1 | 3.836273 | 43709.973 | 184
 2216 | 4538368 | 549781504 | 139901.2 | 3.814698 | 29965.403 | 184
 2217 | 4540416 | 550158336 | 135176.4 | 3.807659 | 23978.557 | 184
 2218 | 4542464 | 550535168 | 142360.5 | 3.849300 | 31721.137 | 184
 2219 | 4544512 | 550912000 | 135992.9 | 3.788159 | 34727.303 | 184
 2220 | 4546560 | 551288832 | 133508.4 | 3.844147 | 32845.375 | 184
 2221 | 4548608 | 551665664 | 135960.1 | 3.822625 | 33321.727 | 184
 2222 | 4550656 | 552042496 | 135628.2 | 3.804032 | 26830.939 | 184
 2223 | 4552704 | 552419328 | 136251.7 | 3.820960 | 20352.140 | 184
 2224 | 4554752 | 552796160 | 133403.1 | 3.826795 | 19436.062 | 184
 2225 | 4556800 | 553172992 | 137249.6 | 3.825352 | 22237.349 | 184
 2226 | 4558848 | 553549824 | 136232.5 | 3.788841 | 29513.293 | 184
 2227 | 4560896 | 553926656 | 133300.4 | 3.785101 | 32172.255 | 184
 2228 | 4562944 | 554303488 | 136300.8 | 3.807305 | 24996.854 | 184
 2229 | 4564992 | 554680320 | 135755.6 | 3.815479 | 20479.202 | 184
 2230 | 4567040 | 555057152 | 136590.3 | 3.781206 | 21568.774 | 184
 2231 | 4569088 | 555433984 | 132910.8 | 3.807284 | 21393.127 | 184
 2232 | 4571136 | 555810816 | 135324.7 | 3.816277 | 18909.535 | 184
 2233 | 4573184 | 556187648 | 139469.8 | 3.813556 | 23902.361 | 184
 2234 | 4575232 | 556564480 | 144752.8 | 3.801430 | 26250.504 | 184
 2235 | 4577280 | 556941312 | 130663.2 | 3.805100 | 23305.657 | 184
 2236 | 4579328 | 557318144 | 133457.3 | 3.817347 | 27712.330 | 184
 2237 | 4581376 | 557694976 | 139150.8 | 3.834468 | 24032.719 | 184
 2238 | 4583424 | 558071808 | 139156.6 | 3.818223 | 22896.273 | 184
 2239 | 4585472 | 558448640 | 139591.7 | 3.810268 | 20830.159 | 184
 2240 | 4587520 | 558825472 | 135684.2 | 3.764055 | 23932.211 | 184
 2241 | 4589568 | 559202304 | 140587.7 | 3.823497 | 25174.077 | 184
 2242 | 4591616 | 559579136 | 135055.8 | 3.811233 | 23612.720 | 184
 2243 | 4593664 | 559955968 | 140211.8 | 3.807998 | 24478.097 | 184
 2244 | 4595712 | 560332800 | 136785.8 | 3.815568 | 23424.276 | 184
 2245 | 4597760 | 560709632 | 134983.5 | 3.814304 | 23262.633 | 184
 2246 | 4599808 | 561086464 | 138699.3 | 3.815813 | 26031.122 | 184
 2247 | 4601856 | 561463296 | 136605.3 | 3.810181 | 26599.949 | 184
 2248 | 4603904 | 561840128 | 135014.0 | 3.784707 | 26219.710 | 184
 2249 | 4605952 | 562216960 | 135712.7 | 3.794142 | 24770.745 | 184
 2250 | 4608000 | 562593792 | 136166.1 | 3.824245 | 36288.851 | 184

 ------------------------------------------------------------------------------------------------
  validation loss at iteration 2250 | lm loss value: 3.775213E+00 | lm loss PPL: 4.360677E+01 |
 ------------------------------------------------------------------------------------------------
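 The perplexity printed in the validation report is simply the exponential of the reported lm loss, so the two numbers can be cross-checked directly; a small Python sketch using the values copied from the block above. (The 326690.1 ms logged for iteration 2251 in the next table is likely the usual ~138 s step plus the time spent in this validation pass, so the one-off spike is expected rather than a slowdown.)

    import math

    # Cross-check the validation report: "lm loss PPL" should equal exp(lm loss).
    val_loss = 3.775213            # "lm loss value" at iteration 2250
    val_ppl_logged = 4.360677e+01  # "lm loss PPL" from the same line

    assert math.isclose(math.exp(val_loss), val_ppl_logged, rel_tol=1e-5)
    print(f"exp({val_loss}) = {math.exp(val_loss):.6f} ~= {val_ppl_logged}")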
 iter | consumed samples | consumed tokens | elapsed/iter (ms) | lm loss | grad norm | curriculum seqlen
 2251 | 4610048 | 562970624 | 326690.1 | 3.806533 | 33756.481 | 184
 2252 | 4612096 | 563347456 | 141230.9 | 3.811161 | 24801.789 | 184
 2253 | 4614144 | 563724288 | 145639.3 | 3.786703 | 29821.757 | 184
 2254 | 4616192 | 564101120 | 137905.2 | 3.796359 | 40182.463 | 184
 2255 | 4618240 | 564477952 | 140385.6 | 3.818343 | 47921.489 | 184
 2256 | 4620288 | 564854784 | 140168.4 | 3.799305 | 50646.272 | 184
 2257 | 4622336 | 565231616 | 136899.7 | 3.819925 | 42274.388 | 184
 2258 | 4624384 | 565608448 | 139876.0 | 3.792223 | 25585.279 | 184
 2259 | 4626432 | 565985280 | 142265.1 | 3.785164 | 28981.313 | 184
 2260 | 4628480 | 566362112 | 141387.0 | 3.807217 | 24336.025 | 184
 2261 | 4630528 | 566738944 | 140643.4 | 3.777678 | 21379.086 | 184
 2262 | 4632576 | 567115776 | 141717.8 | 3.811987 | 33357.636 | 184
 2263 | 4634624 | 567492608 | 137260.6 | 3.771559 | 37039.941 | 184
 2264 | 4636672 | 567869440 | 140447.9 | 3.793582 | 23047.859 | 184
 2265 | 4638720 | 568246272 | 137563.5 | 3.800085 | 32445.851 | 184
 2266 | 4640768 | 568623104 | 142938.0 | 3.790187 | 34655.067 | 184
 2267 | 4642816 | 568999936 | 139471.3 | 3.765433 | 22815.377 | 184
 2268 | 4644864 | 569376768 | 141285.7 | 3.818110 | 24715.741 | 184
 2269 | 4646912 | 569753600 | 140618.6 | 3.793754 | 22623.976 | 184
 2270 | 4648960 | 570130432 | 137337.1 | 3.789413 | 25993.892 | 184
 2271 | 4651008 | 570507264 | 142470.0 | 3.820823 | 29721.733 | 184
 2272 | 4653056 | 570884096 | 137439.5 | 3.810469 | 26717.253 | 184
 2273 | 4655104 | 571260928 | 137585.9 | 3.824480 | 25618.248 | 184
 2274 | 4657152 | 571637760 | 140752.1 | 3.779830 | 31146.127 | 184
 2275 | 4659200 | 572014592 | 138998.6 | 3.829388 | 28305.447 | 184
 2276 | 4661248 | 572391424 | 141605.3 | 3.811153 | 23938.940 | 184
 2277 | 4663296 | 572768256 | 140503.3 | 3.807840 | 26012.881 | 184
 2278 | 4665344 | 573145088 | 135897.1 | 3.807946 | 21709.369 | 184
 2279 | 4667392 | 573521920 | 138821.4 | 3.794024 | 20685.876 | 184
 2280 | 4669440 | 573898752 | 139995.3 | 3.791292 | 27346.858 | 184
 2281 | 4671488 | 574275584 | 139387.0 | 3.821162 | 29844.815 | 184
 2282 | 4673536 | 574652416 | 136855.5 | 3.814487 | 26154.094 | 184
 2283 | 4675584 | 575029248 | 136847.7 | 3.781634 | 26652.646 | 184
 2284 | 4677632 | 575406080 | 140407.3 | 3.780706 | 26434.768 | 184
 2285 | 4679680 | 575782912 | 141612.2 | 3.809583 | 19780.523 | 184
 2286 | 4681728 | 576159744 | 139812.8 | 3.816165 | 27252.961 | 184
 2287 | 4683776 | 576536576 | 142777.9 | 3.800237 | 31182.752 | 184
 2288 | 4685824 | 576913408 | 141301.0 | 3.793540 | 27490.708 | 184
 2289 | 4687872 | 577290240 | 138535.9 | 3.805202 | 29329.756 | 184
 2290 | 4689920 | 577667072 | 136100.1 | 3.773138 | 31366.846 | 184
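 The "iteration N/ 292968" field plus the elapsed-time column also gives a rough time-to-completion estimate. A back-of-envelope Python sketch, assuming the recent ~138 s/iteration pace holds; since curriculum seqlen keeps growing, the real pace will drift, so treat the result as a crude floor rather than a schedule:

    # Rough ETA from the log fields above; ~138 s/iteration is eyeballed
    # from the recent elapsed-time column and is an assumption, not a
    # measurement of the full run.
    TOTAL_ITERS = 292968   # from "iteration N/ 292968"
    current_iter = 2290    # last complete row above
    sec_per_iter = 138.0

    remaining = TOTAL_ITERS - current_iter
    days = remaining * sec_per_iter / 86400
    print(f"{remaining} iterations remaining -> ~{days:.0f} days at this pace")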
 iter | consumed samples | consumed tokens | elapsed/iter (ms) | lm loss | grad norm | curriculum seqlen
 2291 | 4691968 | 578043904 | 137839.1 | 3.772840 | 25611.072 | 184
 2292 | 4694016 | 578420736 | 138835.7 | 3.788571 | 28408.388 | 184
 2293 | 4696064 | 578797568 | 148713.1 | 3.799145 | 26271.720 | 184
 2294 | 4698112 | 579174400 | 136491.9 | 3.786582 | 24008.702 | 184
 2295 | 4700160 | 579551232 | 138925.9 | 3.795625 | 34949.553 | 184
 2296 | 4702208 | 579928064 | 139916.8 | 3.797352 | 31041.237 | 184
 2297 | 4704256 | 580304896 | 136124.7 | 3.799598 | 24966.905 | 184
 2298 | 4706304 | 580681728 | 136939.9 | 3.806225 | 29186.431 | 184
 2299 | 4708352 | 581058560 | 135712.1 | 3.813261 | 25879.669 | 184
 2300 | 4710400 | 581435392 | 137200.6 | 3.808423 | 24835.979 | 184
 2301 | 4712448 | 581812224 | 135023.2 | 3.788499 | 29203.944 | 184
 2302 | 4714496 | 582189056 | 136988.6 | 3.786563 | 26620.103 | 184
 2303 | 4716544 | 582565888 | 137065.9 | 3.787659 | 26205.626 | 184
 2304 | 4718592 | 582942720 | 137510.3 | 3.789042 | 24379.418 | 184
 2305 | 4720640 | 583319552 | 134648.1 | 3.779527 | 23296.891 | 184
 2306 | 4722688 | 583696384 | 139119.3 | 3.793372 | 20663.923 | 184
 2307 | 4724736 | 584073216 | 137816.6 | 3.785955 | 20223.829 | 184
 2308 | 4726784 | 584450048 | 137972.1 | 3.792054 | 25428.691 | 184
 2309 | 4728832 | 584826880 | 137829.7 | 3.808517 | 24437.671 | 184
 2310 | 4730880 | 585203712 | 137234.5 | 3.779271 | 24025.526 | 184
 2311 | 4732928 | 585580544 | 137278.6 | 3.780249 | 32621.760 | 184
 2312 | 4734976 | 585957376 | 137327.9 | 3.793823 | 48605.272 | 184
 2313 | 4737024 | 586334208 | 141719.3 | 3.783066 | 57589.739 | 184
 2314 | 4739072 | 586711040 | 135607.2 | 3.806716 | 36614.681 | 184
 2315 | 4741120 | 587087872 | 137042.0 | 3.799241 | 32383.958 | 184
 2316 | 4743168 | 587464704 | 136829.0 | 3.794459 | 40591.784 | 184
 2317 | 4745216 | 587841536 | 138734.1 | 3.801472 | 19971.896 | 184
 2318 | 4747264 | 588218368 | 131311.1 | 3.765103 | 25669.053 | 184
 2319 | 4749312 | 588595200 | 135166.1 | 3.788629 | 22846.928 | 184
 2320 | 4751360 | 588972032 | 132467.7 | 3.767619 | 25921.588 | 184
 2321 | 4753408 | 589348864 | 136980.6 | 3.787257 | 23623.056 | 184
 2322 | 4755456 | 589725696 | 133893.7 | 3.772480 | 20524.785 | 184
 2323 | 4757504 | 590118912 | 135045.0 | 3.788449 | 27241.998 | 192
 2324 | 4759552 | 590512128 | 148159.8 | 3.792671 | 24644.995 | 192
 2325 | 4761600 | 590905344 | 139395.0 | 3.806061 | 22456.145 | 192
 2326 | 4763648 | 591298560 | 135513.4 | 3.799061 | 21644.013 | 192
 2327 | 4765696 | 591691776 | 134171.6 | 3.805687 | 35135.261 | 192
 2328 | 4767744 | 592084992 | 132534.6 | 3.778266 | 37433.986 | 192

 iteration 2329/ 292968 | consumed samples: 4769792 | consumed tokens: 592478208 | elapsed time per iteration (ms): 135285.5 | learning rate:
1.000E-04 | global batch size: 2048 | lm loss: 3.761718E+00 | loss scale: 65536.0 | grad norm: 31271.460 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2330/ 292968 | consumed samples: 4771840 | consumed tokens: 592871424 | elapsed time per iteration (ms): 132396.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.780925E+00 | loss scale: 65536.0 | grad norm: 21112.305 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2331/ 292968 | consumed samples: 4773888 | consumed tokens: 593264640 | elapsed time per iteration (ms): 132459.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.782613E+00 | loss scale: 65536.0 | grad norm: 26511.431 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2332/ 292968 | consumed samples: 4775936 | consumed tokens: 593657856 | elapsed time per iteration (ms): 134361.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.776858E+00 | loss scale: 65536.0 | grad norm: 24936.702 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2333/ 292968 | consumed samples: 4777984 | consumed tokens: 594051072 | elapsed time per iteration (ms): 131513.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.768156E+00 | loss scale: 65536.0 | grad norm: 19928.109 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2334/ 292968 | consumed samples: 4780032 | consumed tokens: 594444288 | elapsed time per iteration (ms): 134431.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.745637E+00 | loss scale: 65536.0 | grad norm: 22991.672 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2335/ 292968 | consumed samples: 4782080 | consumed tokens: 594837504 | elapsed time per iteration (ms): 133580.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.763217E+00 | loss scale: 65536.0 | grad norm: 25147.072 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2336/ 292968 | consumed samples: 4784128 | consumed tokens: 595230720 | elapsed time per iteration (ms): 133768.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.754317E+00 | loss scale: 65536.0 | grad norm: 20827.620 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2337/ 292968 | consumed samples: 4786176 | consumed tokens: 595623936 | elapsed time per iteration (ms): 132071.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.784703E+00 | loss scale: 65536.0 | grad norm: 18218.925 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2338/ 292968 | consumed samples: 4788224 | consumed tokens: 596017152 | elapsed time per iteration (ms): 130887.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.753055E+00 | loss scale: 65536.0 | grad norm: 21434.371 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2339/ 292968 | 
consumed samples: 4790272 | consumed tokens: 596410368 | elapsed time per iteration (ms): 129340.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.779580E+00 | loss scale: 65536.0 | grad norm: 25850.227 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2340/ 292968 | consumed samples: 4792320 | consumed tokens: 596803584 | elapsed time per iteration (ms): 134221.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.780405E+00 | loss scale: 65536.0 | grad norm: 28309.382 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2341/ 292968 | consumed samples: 4794368 | consumed tokens: 597196800 | elapsed time per iteration (ms): 129570.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.760766E+00 | loss scale: 65536.0 | grad norm: 27654.518 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2342/ 292968 | consumed samples: 4796416 | consumed tokens: 597590016 | elapsed time per iteration (ms): 128293.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.788010E+00 | loss scale: 65536.0 | grad norm: 30290.789 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2343/ 292968 | consumed samples: 4798464 | consumed tokens: 597983232 | elapsed time per iteration (ms): 132659.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.784873E+00 | loss scale: 65536.0 | grad norm: 32591.535 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2344/ 292968 | consumed samples: 4800512 | consumed tokens: 598376448 | elapsed time per iteration (ms): 130786.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.754043E+00 | loss scale: 65536.0 | grad norm: 25787.292 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2345/ 292968 | consumed samples: 4802560 | consumed tokens: 598769664 | elapsed time per iteration (ms): 131357.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.772280E+00 | loss scale: 65536.0 | grad norm: 21884.519 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2346/ 292968 | consumed samples: 4804608 | consumed tokens: 599162880 | elapsed time per iteration (ms): 131659.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.752135E+00 | loss scale: 65536.0 | grad norm: 31742.024 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2347/ 292968 | consumed samples: 4806656 | consumed tokens: 599556096 | elapsed time per iteration (ms): 130322.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.764308E+00 | loss scale: 65536.0 | grad norm: 27857.290 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2348/ 292968 | consumed samples: 4808704 | consumed tokens: 599949312 | elapsed time per iteration (ms): 132210.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.778388E+00 | loss scale: 65536.0 | grad norm: 22433.483 | num zeros: 0.0 | curriculum 
seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2349/ 292968 | consumed samples: 4810752 | consumed tokens: 600342528 | elapsed time per iteration (ms): 133927.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.735103E+00 | loss scale: 65536.0 | grad norm: 22820.581 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2350/ 292968 | consumed samples: 4812800 | consumed tokens: 600735744 | elapsed time per iteration (ms): 135096.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.767247E+00 | loss scale: 65536.0 | grad norm: 22409.510 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2351/ 292968 | consumed samples: 4814848 | consumed tokens: 601128960 | elapsed time per iteration (ms): 133915.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.719773E+00 | loss scale: 65536.0 | grad norm: 28507.919 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2352/ 292968 | consumed samples: 4816896 | consumed tokens: 601522176 | elapsed time per iteration (ms): 135086.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.770896E+00 | loss scale: 65536.0 | grad norm: 29331.382 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2353/ 292968 | consumed samples: 4818944 | consumed tokens: 601915392 | elapsed time per iteration (ms): 135051.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.781532E+00 | loss scale: 65536.0 | grad norm: 33674.006 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2354/ 292968 | consumed samples: 4820992 | consumed tokens: 602308608 | elapsed time per iteration (ms): 133474.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.745422E+00 | loss scale: 65536.0 | grad norm: 36771.636 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2355/ 292968 | consumed samples: 4823040 | consumed tokens: 602701824 | elapsed time per iteration (ms): 137793.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.784761E+00 | loss scale: 65536.0 | grad norm: 38694.169 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2356/ 292968 | consumed samples: 4825088 | consumed tokens: 603095040 | elapsed time per iteration (ms): 133397.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.776262E+00 | loss scale: 65536.0 | grad norm: 38071.456 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2357/ 292968 | consumed samples: 4827136 | consumed tokens: 603488256 | elapsed time per iteration (ms): 133550.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.772212E+00 | loss scale: 65536.0 | grad norm: 35912.771 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2358/ 292968 | consumed samples: 4829184 | consumed tokens: 603881472 | elapsed time per iteration (ms): 132853.0 | learning rate: 1.000E-04 | global 
batch size: 2048 | lm loss: 3.749775E+00 | loss scale: 65536.0 | grad norm: 28438.350 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2359/ 292968 | consumed samples: 4831232 | consumed tokens: 604274688 | elapsed time per iteration (ms): 133635.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.751421E+00 | loss scale: 65536.0 | grad norm: 21949.278 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2360/ 292968 | consumed samples: 4833280 | consumed tokens: 604667904 | elapsed time per iteration (ms): 129228.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.774833E+00 | loss scale: 65536.0 | grad norm: 28985.203 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2361/ 292968 | consumed samples: 4835328 | consumed tokens: 605061120 | elapsed time per iteration (ms): 133651.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.746464E+00 | loss scale: 65536.0 | grad norm: 28614.124 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2362/ 292968 | consumed samples: 4837376 | consumed tokens: 605454336 | elapsed time per iteration (ms): 134017.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.783215E+00 | loss scale: 65536.0 | grad norm: 25154.116 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2363/ 292968 | consumed samples: 4839424 | consumed tokens: 605847552 | elapsed time per iteration (ms): 135552.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.800763E+00 | loss scale: 65536.0 | grad norm: 20262.004 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2364/ 292968 | consumed samples: 4841472 | consumed tokens: 606240768 | elapsed time per iteration (ms): 132395.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.760957E+00 | loss scale: 65536.0 | grad norm: 24952.141 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2365/ 292968 | consumed samples: 4843520 | consumed tokens: 606633984 | elapsed time per iteration (ms): 134797.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.755268E+00 | loss scale: 65536.0 | grad norm: 27040.585 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2366/ 292968 | consumed samples: 4845568 | consumed tokens: 607027200 | elapsed time per iteration (ms): 131560.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.736548E+00 | loss scale: 65536.0 | grad norm: 23313.335 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2367/ 292968 | consumed samples: 4847616 | consumed tokens: 607420416 | elapsed time per iteration (ms): 136168.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.749416E+00 | loss scale: 65536.0 | grad norm: 30906.371 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2368/ 292968 | consumed samples: 
4849664 | consumed tokens: 607813632 | elapsed time per iteration (ms): 138545.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.778869E+00 | loss scale: 65536.0 | grad norm: 39416.143 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2369/ 292968 | consumed samples: 4851712 | consumed tokens: 608206848 | elapsed time per iteration (ms): 134698.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.782683E+00 | loss scale: 65536.0 | grad norm: 30192.879 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2370/ 292968 | consumed samples: 4853760 | consumed tokens: 608600064 | elapsed time per iteration (ms): 136030.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.742288E+00 | loss scale: 65536.0 | grad norm: 22264.819 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2371/ 292968 | consumed samples: 4855808 | consumed tokens: 608993280 | elapsed time per iteration (ms): 135941.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.756645E+00 | loss scale: 65536.0 | grad norm: 34413.442 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2372/ 292968 | consumed samples: 4857856 | consumed tokens: 609386496 | elapsed time per iteration (ms): 136463.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.760472E+00 | loss scale: 65536.0 | grad norm: 30478.607 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2373/ 292968 | consumed samples: 4859904 | consumed tokens: 609779712 | elapsed time per iteration (ms): 134292.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.743210E+00 | loss scale: 65536.0 | grad norm: 22133.657 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2374/ 292968 | consumed samples: 4861952 | consumed tokens: 610172928 | elapsed time per iteration (ms): 135442.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.734931E+00 | loss scale: 65536.0 | grad norm: 22980.148 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2375/ 292968 | consumed samples: 4864000 | consumed tokens: 610566144 | elapsed time per iteration (ms): 131389.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.755670E+00 | loss scale: 65536.0 | grad norm: 29361.898 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2376/ 292968 | consumed samples: 4866048 | consumed tokens: 610959360 | elapsed time per iteration (ms): 134247.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.770886E+00 | loss scale: 65536.0 | grad norm: 26666.341 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2377/ 292968 | consumed samples: 4868096 | consumed tokens: 611352576 | elapsed time per iteration (ms): 131788.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.747808E+00 | loss scale: 65536.0 | grad norm: 20872.050 | num zeros: 0.0 | curriculum seqlen: 192 | 
number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2378/ 292968 | consumed samples: 4870144 | consumed tokens: 611745792 | elapsed time per iteration (ms): 137302.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.732145E+00 | loss scale: 65536.0 | grad norm: 25373.620 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2379/ 292968 | consumed samples: 4872192 | consumed tokens: 612139008 | elapsed time per iteration (ms): 138154.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.759680E+00 | loss scale: 65536.0 | grad norm: 25515.008 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2380/ 292968 | consumed samples: 4874240 | consumed tokens: 612532224 | elapsed time per iteration (ms): 141649.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.751046E+00 | loss scale: 65536.0 | grad norm: 22492.927 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2381/ 292968 | consumed samples: 4876288 | consumed tokens: 612925440 | elapsed time per iteration (ms): 141868.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.744042E+00 | loss scale: 65536.0 | grad norm: 25279.445 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2382/ 292968 | consumed samples: 4878336 | consumed tokens: 613318656 | elapsed time per iteration (ms): 143144.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.746763E+00 | loss scale: 65536.0 | grad norm: 24352.975 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2383/ 292968 | consumed samples: 4880384 | consumed tokens: 613711872 | elapsed time per iteration (ms): 139923.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.757558E+00 | loss scale: 65536.0 | grad norm: 22244.628 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2384/ 292968 | consumed samples: 4882432 | consumed tokens: 614105088 | elapsed time per iteration (ms): 139047.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.751666E+00 | loss scale: 65536.0 | grad norm: 22457.497 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2385/ 292968 | consumed samples: 4884480 | consumed tokens: 614498304 | elapsed time per iteration (ms): 148010.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.749250E+00 | loss scale: 65536.0 | grad norm: 28766.214 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2386/ 292968 | consumed samples: 4886528 | consumed tokens: 614891520 | elapsed time per iteration (ms): 142768.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.738948E+00 | loss scale: 65536.0 | grad norm: 36129.565 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2387/ 292968 | consumed samples: 4888576 | consumed tokens: 615284736 | elapsed time per iteration (ms): 142625.0 | learning rate: 1.000E-04 | global batch size: 
2048 | lm loss: 3.758482E+00 | loss scale: 65536.0 | grad norm: 31602.790 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2388/ 292968 | consumed samples: 4890624 | consumed tokens: 615677952 | elapsed time per iteration (ms): 145632.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.759584E+00 | loss scale: 65536.0 | grad norm: 23627.824 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2389/ 292968 | consumed samples: 4892672 | consumed tokens: 616071168 | elapsed time per iteration (ms): 142656.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.750874E+00 | loss scale: 65536.0 | grad norm: 27804.200 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2390/ 292968 | consumed samples: 4894720 | consumed tokens: 616464384 | elapsed time per iteration (ms): 146247.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.753480E+00 | loss scale: 65536.0 | grad norm: 31930.807 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2391/ 292968 | consumed samples: 4896768 | consumed tokens: 616857600 | elapsed time per iteration (ms): 153497.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.764813E+00 | loss scale: 65536.0 | grad norm: 32544.504 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2392/ 292968 | consumed samples: 4898816 | consumed tokens: 617250816 | elapsed time per iteration (ms): 155263.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.777633E+00 | loss scale: 65536.0 | grad norm: 27552.753 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2393/ 292968 | consumed samples: 4900864 | consumed tokens: 617644032 | elapsed time per iteration (ms): 161738.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.744725E+00 | loss scale: 65536.0 | grad norm: 25779.390 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2394/ 292968 | consumed samples: 4902912 | consumed tokens: 618037248 | elapsed time per iteration (ms): 155523.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.743825E+00 | loss scale: 65536.0 | grad norm: 24728.800 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2395/ 292968 | consumed samples: 4904960 | consumed tokens: 618430464 | elapsed time per iteration (ms): 158291.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.761549E+00 | loss scale: 65536.0 | grad norm: 26398.802 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2396/ 292968 | consumed samples: 4907008 | consumed tokens: 618823680 | elapsed time per iteration (ms): 151622.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.754889E+00 | loss scale: 65536.0 | grad norm: 20309.675 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2397/ 292968 | consumed samples: 4909056 | 
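Note on the counters above: each iteration advances "consumed tokens" by exactly global batch size x curriculum seqlen (2048 x 184 = 376832 tokens per iteration up to iteration 2322, and 2048 x 192 = 393216 from iteration 2323 on, where the curriculum raised the sequence length from 184 to 192). A minimal Python sketch that re-checks this accounting and derives rough token throughput; the values are copied from the records above, and the accounting rule is inferred from those values rather than taken from the training code:

    # (iteration, consumed tokens, curriculum seqlen, elapsed time per iteration in ms)
    records = [
        (2322, 589725696, 184, 133893.7),
        (2323, 590118912, 192, 135045.0),  # curriculum seqlen steps 184 -> 192 here
        (2324, 590512128, 192, 148159.8),
    ]
    GLOBAL_BATCH_SIZE = 2048

    for (_, prev_tokens, _, _), (it, tokens, seqlen, ms) in zip(records, records[1:]):
        delta = tokens - prev_tokens
        # Both consecutive pairs advance by 2048 * 192 = 393216 tokens.
        assert delta == GLOBAL_BATCH_SIZE * seqlen
        print(f"iter {it}: +{delta} tokens, ~{delta / (ms / 1000.0):,.0f} tokens/s")

At ~135 s per iteration this works out to roughly 2,700-2,900 tokens/s at the logged global batch size.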
- iteration 2397/ 292968 | consumed samples: 4909056 | consumed tokens: 619216896 | elapsed time per iteration (ms): 148322.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.745060E+00 | loss scale: 65536.0 | grad norm: 19216.524 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2398/ 292968 | consumed samples: 4911104 | consumed tokens: 619610112 | elapsed time per iteration (ms): 142957.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.738704E+00 | loss scale: 65536.0 | grad norm: 20710.235 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2399/ 292968 | consumed samples: 4913152 | consumed tokens: 620003328 | elapsed time per iteration (ms): 138041.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.722674E+00 | loss scale: 65536.0 | grad norm: 21790.376 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2400/ 292968 | consumed samples: 4915200 | consumed tokens: 620396544 | elapsed time per iteration (ms): 138313.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.755920E+00 | loss scale: 65536.0 | grad norm: 22091.482 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 2400 | lm loss value: 3.732941E+00 | lm loss PPL: 4.180188E+01 |
-------------------------------------------------------------------------------------------------
-saving checkpoint at iteration 2400 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-28 11:45:53,417] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/mp_rank_01_model_states.pt
-[2021-10-28 11:45:53,448] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/mp_rank_00_model_states.pt
-[2021-10-28 11:46:07,163] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_20_optim_states.pt
-[2021-10-28 11:46:07,181] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_43_optim_states.pt
-[2021-10-28 11:46:07,188] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_12_optim_states.pt
-[2021-10-28 11:46:07,240] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_49_optim_states.pt
-[2021-10-28 11:46:07,282] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_63_optim_states.pt
-[2021-10-28 11:46:07,330] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_17_optim_states.pt
-[2021-10-28 11:46:07,349] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_08_optim_states.pt
-[2021-10-28 11:46:07,420] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_15_optim_states.pt
-[2021-10-28 11:46:07,433] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_50_optim_states.pt
-[2021-10-28 11:46:07,449] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_38_optim_states.pt
-[2021-10-28 11:46:07,450] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_19_optim_states.pt
-[2021-10-28 11:46:07,475] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_33_optim_states.pt
-[2021-10-28 11:46:07,510] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_44_optim_states.pt
-[2021-10-28 11:46:07,535] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_88_optim_states.pt
-[2021-10-28 11:46:07,561] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_56_optim_states.pt
-[2021-10-28 11:46:07,569] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_60_optim_states.pt
-[2021-10-28 11:46:07,574] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_90_optim_states.pt
-[2021-10-28 11:46:07,578] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-28 11:46:07,602] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_46_optim_states.pt
-[2021-10-28 11:46:07,605] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_57_optim_states.pt
-[2021-10-28 11:46:07,617] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_52_optim_states.pt
-[2021-10-28 11:46:07,619] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_26_optim_states.pt
-[2021-10-28 11:46:07,656] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_24_optim_states.pt
-[2021-10-28 11:46:07,679] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_102_optim_states.pt
-[2021-10-28 11:46:07,717] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_105_optim_states.pt
-[2021-10-28 11:46:07,722] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_72_optim_states.pt
-[2021-10-28 11:46:07,725] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_53_optim_states.pt
-[2021-10-28 11:46:07,812] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_34_optim_states.pt
-[2021-10-28 11:46:07,965] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_10_optim_states.pt
-[2021-10-28 11:46:08,012] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_74_optim_states.pt
-[2021-10-28 11:46:08,019] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_107_optim_states.pt
-[2021-10-28 11:46:08,093] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_81_optim_states.pt
-[2021-10-28 11:46:08,134] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_29_optim_states.pt
-[2021-10-28 11:46:08,168] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_41_optim_states.pt
-[2021-10-28 11:46:08,224] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_103_optim_states.pt
-[2021-10-28 11:46:08,278] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_28_optim_states.pt
-[2021-10-28 11:46:08,305] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_96_optim_states.pt
-[2021-10-28 11:46:08,345] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_70_optim_states.pt
-[2021-10-28 11:46:08,356] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_18_optim_states.pt
-[2021-10-28 11:46:08,396] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_84_optim_states.pt
-[2021-10-28 11:46:08,397] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_36_optim_states.pt
-[2021-10-28 11:46:08,401] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_120_optim_states.pt
-[2021-10-28 11:46:08,421] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_14_optim_states.pt
-[2021-10-28 11:46:08,444] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_51_optim_states.pt
-[2021-10-28 11:46:08,448] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_61_optim_states.pt
-[2021-10-28 11:46:08,451] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_05_optim_states.pt
-[2021-10-28 11:46:08,455] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_118_optim_states.pt
-[2021-10-28 11:46:08,467] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_40_optim_states.pt
-[2021-10-28 11:46:08,479] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_85_optim_states.pt
-[2021-10-28 11:46:08,491] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_109_optim_states.pt
-[2021-10-28 11:46:08,496] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_11_optim_states.pt
-[2021-10-28 11:46:08,505] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_65_optim_states.pt
-[2021-10-28 11:46:08,511] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_07_optim_states.pt
-[2021-10-28 11:46:08,514] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_27_optim_states.pt
-[2021-10-28 11:46:08,544] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_23_optim_states.pt
-[2021-10-28 11:46:08,557] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_35_optim_states.pt
-[2021-10-28 11:46:08,567] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_42_optim_states.pt
-[2021-10-28 11:46:08,595] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_54_optim_states.pt
-[2021-10-28 11:46:08,607] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_97_optim_states.pt
-[2021-10-28 11:46:08,607] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_58_optim_states.pt
-[2021-10-28 11:46:08,612] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_59_optim_states.pt
-[2021-10-28 11:46:08,638] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_62_optim_states.pt
-[2021-10-28 11:46:08,641] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_55_optim_states.pt
-[2021-10-28 11:46:08,642] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_48_optim_states.pt
-[2021-10-28 11:46:08,645] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_13_optim_states.pt
-[2021-10-28 11:46:08,678] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_123_optim_states.pt
-[2021-10-28 11:46:08,687] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_112_optim_states.pt
-[2021-10-28 11:46:08,689] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_45_optim_states.pt
-[2021-10-28 11:46:08,709] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_21_optim_states.pt
-[2021-10-28 11:46:08,709] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_68_optim_states.pt
-[2021-10-28 11:46:08,724] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_116_optim_states.pt
-[2021-10-28 11:46:08,736] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_111_optim_states.pt
-[2021-10-28 11:46:08,756] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_09_optim_states.pt
-[2021-10-28 11:46:08,759] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_64_optim_states.pt
-[2021-10-28 11:46:08,790] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_78_optim_states.pt
-[2021-10-28 11:46:08,832] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_83_optim_states.pt
-[2021-10-28 11:46:08,838] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_106_optim_states.pt
-[2021-10-28 11:46:08,839] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_16_optim_states.pt
-[2021-10-28 11:46:08,840] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_82_optim_states.pt
-[2021-10-28 11:46:08,851] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_32_optim_states.pt
-[2021-10-28 11:46:08,862] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_98_optim_states.pt
-[2021-10-28 11:46:08,872] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_25_optim_states.pt
-[2021-10-28 11:46:08,909] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_101_optim_states.pt
-[2021-10-28 11:46:08,909] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_113_optim_states.pt
-[2021-10-28 11:46:08,917] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_77_optim_states.pt
-[2021-10-28 11:46:08,925] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_30_optim_states.pt
-[2021-10-28 11:46:09,031] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_31_optim_states.pt
-[2021-10-28 11:46:09,056] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_100_optim_states.pt
-[2021-10-28 11:46:09,074] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_99_optim_states.pt
-[2021-10-28 11:46:09,086] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_104_optim_states.pt
-[2021-10-28 11:46:09,158] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_122_optim_states.pt
-[2021-10-28 11:46:09,160] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_114_optim_states.pt
-[2021-10-28 11:46:09,188] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_117_optim_states.pt
-[2021-10-28 11:46:09,236] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_115_optim_states.pt
-[2021-10-28 11:46:09,281] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_119_optim_states.pt
-[2021-10-28 11:46:09,326] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_80_optim_states.pt
-[2021-10-28 11:46:09,341] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_79_optim_states.pt
-[2021-10-28 11:46:09,353] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_121_optim_states.pt
-[2021-10-28 11:46:09,365] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_69_optim_states.pt
-[2021-10-28 11:46:09,438] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_71_optim_states.pt
-[2021-10-28 11:46:09,497] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_67_optim_states.pt
-[2021-10-28 11:46:09,512] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_76_optim_states.pt
-[2021-10-28 11:46:09,529] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_66_optim_states.pt
-[2021-10-28 11:46:09,649] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_47_optim_states.pt
-[2021-10-28 11:46:10,072] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_00_optim_states.pt
-[2021-10-28 11:46:10,077] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_01_optim_states.pt
-[2021-10-28 11:46:10,496] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_127_optim_states.pt
-[2021-10-28 11:46:10,619] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_126_optim_states.pt
-[2021-10-28 11:46:11,292] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_06_optim_states.pt
-[2021-10-28 11:46:11,351] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_02_optim_states.pt
-[2021-10-28 11:46:11,427] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_03_optim_states.pt
-[2021-10-28 11:46:11,582] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_04_optim_states.pt
-[2021-10-28 11:46:11,916] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_124_optim_states.pt
-[2021-10-28 11:46:11,964] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_125_optim_states.pt
-[2021-10-28 11:46:14,103] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_91_optim_states.pt
-[2021-10-28 11:46:14,194] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_39_optim_states.pt
-[2021-10-28 11:46:14,317] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_110_optim_states.pt
-[2021-10-28 11:46:14,384] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_37_optim_states.pt
-[2021-10-28 11:46:14,586] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_89_optim_states.pt
-[2021-10-28 11:46:15,195] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_94_optim_states.pt
-[2021-10-28 11:46:15,332] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_95_optim_states.pt
-[2021-10-28 11:46:15,659] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_108_optim_states.pt
-[2021-10-28 11:46:18,548] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_87_optim_states.pt
-[2021-10-28 11:46:18,605] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_75_optim_states.pt
-[2021-10-28 11:46:18,701] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_86_optim_states.pt
-[2021-10-28 11:46:19,864] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_73_optim_states.pt
-[2021-10-28 11:46:20,103] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_92_optim_states.pt
-[2021-10-28 11:46:20,189] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2400/zero_pp_rank_0_mp_rank_93_optim_states.pt
- successfully saved checkpoint at iteration 2400 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 29466.13
- iteration 2401/ 292968 | consumed samples: 4917248 | consumed tokens: 620789760 | elapsed time per iteration (ms): 389600.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.747741E+00 | loss scale: 65536.0 | grad norm: 26495.810 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2402/ 292968 | consumed samples: 4919296 | consumed tokens: 621182976 | elapsed time per iteration (ms): 139733.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.768186E+00 | loss scale: 65536.0 | grad norm: 30283.279 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2403/ 292968 | consumed samples: 4921344 | consumed tokens: 621576192 | elapsed time per iteration (ms): 137917.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.747903E+00 | loss scale: 65536.0 | grad norm: 33256.531 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2404/ 292968 | consumed samples: 4923392 | consumed tokens: 621969408 | elapsed time per iteration (ms): 132643.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.725107E+00 | loss scale: 65536.0 | grad
norm: 27498.528 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2405/ 292968 | consumed samples: 4925440 | consumed tokens: 622362624 | elapsed time per iteration (ms): 135291.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.732228E+00 | loss scale: 65536.0 | grad norm: 22690.989 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2406/ 292968 | consumed samples: 4927488 | consumed tokens: 622755840 | elapsed time per iteration (ms): 132697.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.766750E+00 | loss scale: 65536.0 | grad norm: 22504.007 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2407/ 292968 | consumed samples: 4929536 | consumed tokens: 623149056 | elapsed time per iteration (ms): 127868.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.756174E+00 | loss scale: 65536.0 | grad norm: 22109.567 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2408/ 292968 | consumed samples: 4931584 | consumed tokens: 623542272 | elapsed time per iteration (ms): 136008.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.757931E+00 | loss scale: 65536.0 | grad norm: 25119.328 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2409/ 292968 | consumed samples: 4933632 | consumed tokens: 623935488 | elapsed time per iteration (ms): 132963.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.770503E+00 | loss scale: 65536.0 | grad norm: 25499.052 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2410/ 292968 | consumed samples: 4935680 | consumed tokens: 624328704 | elapsed time per iteration (ms): 134996.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.737018E+00 | loss scale: 65536.0 | grad norm: 24813.249 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2411/ 292968 | consumed samples: 4937728 | consumed tokens: 624721920 | elapsed time per iteration (ms): 130434.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.734081E+00 | loss scale: 65536.0 | grad norm: 21494.091 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2412/ 292968 | consumed samples: 4939776 | consumed tokens: 625115136 | elapsed time per iteration (ms): 133080.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.741505E+00 | loss scale: 65536.0 | grad norm: 22602.883 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2413/ 292968 | consumed samples: 4941824 | consumed tokens: 625508352 | elapsed time per iteration (ms): 133810.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.728782E+00 | loss scale: 65536.0 | grad norm: 26104.552 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2414/ 292968 | consumed samples: 4943872 | consumed tokens: 625901568 | elapsed time per iteration (ms): 
136678.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.775312E+00 | loss scale: 65536.0 | grad norm: 29615.744 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2415/ 292968 | consumed samples: 4945920 | consumed tokens: 626294784 | elapsed time per iteration (ms): 132548.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.740290E+00 | loss scale: 65536.0 | grad norm: 25451.444 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2416/ 292968 | consumed samples: 4947968 | consumed tokens: 626688000 | elapsed time per iteration (ms): 133511.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.726371E+00 | loss scale: 65536.0 | grad norm: 28264.315 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2417/ 292968 | consumed samples: 4950016 | consumed tokens: 627081216 | elapsed time per iteration (ms): 132083.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.721961E+00 | loss scale: 65536.0 | grad norm: 29741.225 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2418/ 292968 | consumed samples: 4952064 | consumed tokens: 627474432 | elapsed time per iteration (ms): 131482.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.761437E+00 | loss scale: 65536.0 | grad norm: 26898.014 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2419/ 292968 | consumed samples: 4954112 | consumed tokens: 627867648 | elapsed time per iteration (ms): 134100.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.761405E+00 | loss scale: 65536.0 | grad norm: 21923.326 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2420/ 292968 | consumed samples: 4956160 | consumed tokens: 628260864 | elapsed time per iteration (ms): 132686.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.721832E+00 | loss scale: 65536.0 | grad norm: 22127.807 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2421/ 292968 | consumed samples: 4958208 | consumed tokens: 628654080 | elapsed time per iteration (ms): 132906.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.746765E+00 | loss scale: 65536.0 | grad norm: 24288.739 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2422/ 292968 | consumed samples: 4960256 | consumed tokens: 629047296 | elapsed time per iteration (ms): 135476.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.728182E+00 | loss scale: 65536.0 | grad norm: 26470.967 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2423/ 292968 | consumed samples: 4962304 | consumed tokens: 629440512 | elapsed time per iteration (ms): 135252.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.758277E+00 | loss scale: 65536.0 | grad norm: 26878.141 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - 
iteration 2424/ 292968 | consumed samples: 4964352 | consumed tokens: 629833728 | elapsed time per iteration (ms): 134049.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.748378E+00 | loss scale: 65536.0 | grad norm: 27122.145 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2425/ 292968 | consumed samples: 4966400 | consumed tokens: 630226944 | elapsed time per iteration (ms): 133339.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.758863E+00 | loss scale: 65536.0 | grad norm: 31570.752 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2426/ 292968 | consumed samples: 4968448 | consumed tokens: 630620160 | elapsed time per iteration (ms): 134764.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.758523E+00 | loss scale: 65536.0 | grad norm: 33591.693 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2427/ 292968 | consumed samples: 4970496 | consumed tokens: 631013376 | elapsed time per iteration (ms): 132065.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.740614E+00 | loss scale: 65536.0 | grad norm: 31826.028 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2428/ 292968 | consumed samples: 4972544 | consumed tokens: 631406592 | elapsed time per iteration (ms): 136572.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.732567E+00 | loss scale: 65536.0 | grad norm: 24212.420 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2429/ 292968 | consumed samples: 4974592 | consumed tokens: 631799808 | elapsed time per iteration (ms): 133143.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.713341E+00 | loss scale: 65536.0 | grad norm: 22545.007 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2430/ 292968 | consumed samples: 4976640 | consumed tokens: 632193024 | elapsed time per iteration (ms): 131966.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.741250E+00 | loss scale: 65536.0 | grad norm: 21819.611 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2431/ 292968 | consumed samples: 4978688 | consumed tokens: 632586240 | elapsed time per iteration (ms): 133402.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.728469E+00 | loss scale: 65536.0 | grad norm: 21772.010 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2432/ 292968 | consumed samples: 4980736 | consumed tokens: 632979456 | elapsed time per iteration (ms): 130857.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.725074E+00 | loss scale: 65536.0 | grad norm: 27122.247 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2433/ 292968 | consumed samples: 4982784 | consumed tokens: 633372672 | elapsed time per iteration (ms): 131341.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.729767E+00 | loss scale: 65536.0 | grad norm: 30216.996 | num 
zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2434/ 292968 | consumed samples: 4984832 | consumed tokens: 633765888 | elapsed time per iteration (ms): 131747.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.760504E+00 | loss scale: 65536.0 | grad norm: 29498.500 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2435/ 292968 | consumed samples: 4986880 | consumed tokens: 634159104 | elapsed time per iteration (ms): 135048.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.727262E+00 | loss scale: 65536.0 | grad norm: 31598.271 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2436/ 292968 | consumed samples: 4988928 | consumed tokens: 634552320 | elapsed time per iteration (ms): 128506.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.750138E+00 | loss scale: 65536.0 | grad norm: 29919.695 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2437/ 292968 | consumed samples: 4990976 | consumed tokens: 634945536 | elapsed time per iteration (ms): 130761.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.732839E+00 | loss scale: 65536.0 | grad norm: 34393.472 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2438/ 292968 | consumed samples: 4993024 | consumed tokens: 635338752 | elapsed time per iteration (ms): 134689.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.725207E+00 | loss scale: 65536.0 | grad norm: 29643.351 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2439/ 292968 | consumed samples: 4995072 | consumed tokens: 635731968 | elapsed time per iteration (ms): 130798.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711891E+00 | loss scale: 65536.0 | grad norm: 23500.834 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2440/ 292968 | consumed samples: 4997120 | consumed tokens: 636125184 | elapsed time per iteration (ms): 133784.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.724564E+00 | loss scale: 65536.0 | grad norm: 31704.168 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2441/ 292968 | consumed samples: 4999168 | consumed tokens: 636518400 | elapsed time per iteration (ms): 132418.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.728829E+00 | loss scale: 65536.0 | grad norm: 21623.264 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2442/ 292968 | consumed samples: 5001216 | consumed tokens: 636911616 | elapsed time per iteration (ms): 133167.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.710727E+00 | loss scale: 65536.0 | grad norm: 22489.520 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2443/ 292968 | consumed samples: 5003264 | consumed tokens: 637304832 | elapsed time per iteration (ms): 137277.2 | learning 
rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.720677E+00 | loss scale: 65536.0 | grad norm: 32994.694 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2444/ 292968 | consumed samples: 5005312 | consumed tokens: 637698048 | elapsed time per iteration (ms): 133287.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.712157E+00 | loss scale: 65536.0 | grad norm: 28978.252 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2445/ 292968 | consumed samples: 5007360 | consumed tokens: 638091264 | elapsed time per iteration (ms): 133519.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.730226E+00 | loss scale: 65536.0 | grad norm: 22006.576 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2446/ 292968 | consumed samples: 5009408 | consumed tokens: 638484480 | elapsed time per iteration (ms): 133181.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.712153E+00 | loss scale: 65536.0 | grad norm: 29674.270 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2447/ 292968 | consumed samples: 5011456 | consumed tokens: 638877696 | elapsed time per iteration (ms): 131008.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.743332E+00 | loss scale: 65536.0 | grad norm: 22249.960 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2448/ 292968 | consumed samples: 5013504 | consumed tokens: 639270912 | elapsed time per iteration (ms): 133562.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.762426E+00 | loss scale: 65536.0 | grad norm: 22020.212 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2449/ 292968 | consumed samples: 5015552 | consumed tokens: 639664128 | elapsed time per iteration (ms): 137081.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.747466E+00 | loss scale: 65536.0 | grad norm: 22249.013 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2450/ 292968 | consumed samples: 5017600 | consumed tokens: 640057344 | elapsed time per iteration (ms): 142745.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.721703E+00 | loss scale: 65536.0 | grad norm: 25344.554 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2451/ 292968 | consumed samples: 5019648 | consumed tokens: 640450560 | elapsed time per iteration (ms): 139427.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.760006E+00 | loss scale: 65536.0 | grad norm: 25096.528 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2452/ 292968 | consumed samples: 5021696 | consumed tokens: 640843776 | elapsed time per iteration (ms): 138934.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.732952E+00 | loss scale: 65536.0 | grad norm: 26808.542 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2453/ 
292968 | consumed samples: 5023744 | consumed tokens: 641236992 | elapsed time per iteration (ms): 140657.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.720376E+00 | loss scale: 65536.0 | grad norm: 22105.993 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2454/ 292968 | consumed samples: 5025792 | consumed tokens: 641630208 | elapsed time per iteration (ms): 144797.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.750073E+00 | loss scale: 65536.0 | grad norm: 20974.329 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2455/ 292968 | consumed samples: 5027840 | consumed tokens: 642023424 | elapsed time per iteration (ms): 139091.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.757902E+00 | loss scale: 65536.0 | grad norm: 21184.311 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2456/ 292968 | consumed samples: 5029888 | consumed tokens: 642416640 | elapsed time per iteration (ms): 134384.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.727026E+00 | loss scale: 65536.0 | grad norm: 20926.311 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2457/ 292968 | consumed samples: 5031936 | consumed tokens: 642809856 | elapsed time per iteration (ms): 134824.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711959E+00 | loss scale: 65536.0 | grad norm: 22816.394 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2458/ 292968 | consumed samples: 5033984 | consumed tokens: 643203072 | elapsed time per iteration (ms): 134604.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.738233E+00 | loss scale: 65536.0 | grad norm: 23015.289 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2459/ 292968 | consumed samples: 5036032 | consumed tokens: 643596288 | elapsed time per iteration (ms): 137086.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.735792E+00 | loss scale: 65536.0 | grad norm: 33238.455 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2460/ 292968 | consumed samples: 5038080 | consumed tokens: 643989504 | elapsed time per iteration (ms): 130497.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715335E+00 | loss scale: 65536.0 | grad norm: 41181.484 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2461/ 292968 | consumed samples: 5040128 | consumed tokens: 644382720 | elapsed time per iteration (ms): 133193.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.745535E+00 | loss scale: 65536.0 | grad norm: 35854.893 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2462/ 292968 | consumed samples: 5042176 | consumed tokens: 644775936 | elapsed time per iteration (ms): 130068.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.708353E+00 | loss scale: 65536.0 | grad norm: 24747.437 | num zeros: 0.0 | 
curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2463/ 292968 | consumed samples: 5044224 | consumed tokens: 645169152 | elapsed time per iteration (ms): 131603.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.726259E+00 | loss scale: 65536.0 | grad norm: 17465.817 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2464/ 292968 | consumed samples: 5046272 | consumed tokens: 645562368 | elapsed time per iteration (ms): 133874.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.701224E+00 | loss scale: 65536.0 | grad norm: 25728.393 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2465/ 292968 | consumed samples: 5048320 | consumed tokens: 645955584 | elapsed time per iteration (ms): 136657.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.710899E+00 | loss scale: 65536.0 | grad norm: 32368.997 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2466/ 292968 | consumed samples: 5050368 | consumed tokens: 646348800 | elapsed time per iteration (ms): 135186.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.710097E+00 | loss scale: 65536.0 | grad norm: 26449.180 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2467/ 292968 | consumed samples: 5052416 | consumed tokens: 646742016 | elapsed time per iteration (ms): 134385.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.751218E+00 | loss scale: 65536.0 | grad norm: 19546.871 | num zeros: 0.0 | curriculum seqlen: 192 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2468/ 292968 | consumed samples: 5054464 | consumed tokens: 647151616 | elapsed time per iteration (ms): 133478.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.760462E+00 | loss scale: 65536.0 | grad norm: 23737.642 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2469/ 292968 | consumed samples: 5056512 | consumed tokens: 647561216 | elapsed time per iteration (ms): 132559.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.731978E+00 | loss scale: 65536.0 | grad norm: 24375.937 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2470/ 292968 | consumed samples: 5058560 | consumed tokens: 647970816 | elapsed time per iteration (ms): 139871.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.731634E+00 | loss scale: 65536.0 | grad norm: 25231.234 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2471/ 292968 | consumed samples: 5060608 | consumed tokens: 648380416 | elapsed time per iteration (ms): 130375.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.719555E+00 | loss scale: 65536.0 | grad norm: 26948.788 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2472/ 292968 | consumed samples: 5062656 | consumed tokens: 648790016 | elapsed time per iteration (ms): 129855.6 | learning rate: 
1.000E-04 | global batch size: 2048 | lm loss: 3.717428E+00 | loss scale: 65536.0 | grad norm: 29281.923 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2473/ 292968 | consumed samples: 5064704 | consumed tokens: 649199616 | elapsed time per iteration (ms): 131775.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.705631E+00 | loss scale: 65536.0 | grad norm: 28659.207 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2474/ 292968 | consumed samples: 5066752 | consumed tokens: 649609216 | elapsed time per iteration (ms): 133601.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.734337E+00 | loss scale: 65536.0 | grad norm: 23868.666 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2475/ 292968 | consumed samples: 5068800 | consumed tokens: 650018816 | elapsed time per iteration (ms): 131487.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.710527E+00 | loss scale: 65536.0 | grad norm: 21140.643 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2476/ 292968 | consumed samples: 5070848 | consumed tokens: 650428416 | elapsed time per iteration (ms): 128894.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.724619E+00 | loss scale: 65536.0 | grad norm: 22078.743 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2477/ 292968 | consumed samples: 5072896 | consumed tokens: 650838016 | elapsed time per iteration (ms): 126103.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.736965E+00 | loss scale: 65536.0 | grad norm: 24979.676 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2478/ 292968 | consumed samples: 5074944 | consumed tokens: 651247616 | elapsed time per iteration (ms): 127228.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.723377E+00 | loss scale: 65536.0 | grad norm: 27236.343 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2479/ 292968 | consumed samples: 5076992 | consumed tokens: 651657216 | elapsed time per iteration (ms): 125697.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711486E+00 | loss scale: 65536.0 | grad norm: 21463.785 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2480/ 292968 | consumed samples: 5079040 | consumed tokens: 652066816 | elapsed time per iteration (ms): 127691.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715552E+00 | loss scale: 65536.0 | grad norm: 20715.203 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2481/ 292968 | consumed samples: 5081088 | consumed tokens: 652476416 | elapsed time per iteration (ms): 127493.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.748083E+00 | loss scale: 65536.0 | grad norm: 23644.313 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2482/ 292968 | 
consumed samples: 5083136 | consumed tokens: 652886016 | elapsed time per iteration (ms): 127868.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.700185E+00 | loss scale: 65536.0 | grad norm: 39262.439 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2483/ 292968 | consumed samples: 5085184 | consumed tokens: 653295616 | elapsed time per iteration (ms): 126146.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.722850E+00 | loss scale: 65536.0 | grad norm: 43067.058 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2484/ 292968 | consumed samples: 5087232 | consumed tokens: 653705216 | elapsed time per iteration (ms): 129324.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711660E+00 | loss scale: 65536.0 | grad norm: 38233.257 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2485/ 292968 | consumed samples: 5089280 | consumed tokens: 654114816 | elapsed time per iteration (ms): 129449.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.689920E+00 | loss scale: 65536.0 | grad norm: 25743.204 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2486/ 292968 | consumed samples: 5091328 | consumed tokens: 654524416 | elapsed time per iteration (ms): 125814.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.714564E+00 | loss scale: 65536.0 | grad norm: 27233.160 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2487/ 292968 | consumed samples: 5093376 | consumed tokens: 654934016 | elapsed time per iteration (ms): 130240.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.734846E+00 | loss scale: 65536.0 | grad norm: 27127.083 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2488/ 292968 | consumed samples: 5095424 | consumed tokens: 655343616 | elapsed time per iteration (ms): 127594.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.699123E+00 | loss scale: 65536.0 | grad norm: 20848.341 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2489/ 292968 | consumed samples: 5097472 | consumed tokens: 655753216 | elapsed time per iteration (ms): 125631.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.727104E+00 | loss scale: 65536.0 | grad norm: 25658.727 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2490/ 292968 | consumed samples: 5099520 | consumed tokens: 656162816 | elapsed time per iteration (ms): 127808.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711399E+00 | loss scale: 65536.0 | grad norm: 24253.847 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2491/ 292968 | consumed samples: 5101568 | consumed tokens: 656572416 | elapsed time per iteration (ms): 127414.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.729037E+00 | loss scale: 65536.0 | grad norm: 22636.586 | num zeros: 0.0 | curriculum 
seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2492/ 292968 | consumed samples: 5103616 | consumed tokens: 656982016 | elapsed time per iteration (ms): 126033.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.737489E+00 | loss scale: 65536.0 | grad norm: 20919.173 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2493/ 292968 | consumed samples: 5105664 | consumed tokens: 657391616 | elapsed time per iteration (ms): 127754.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.705382E+00 | loss scale: 65536.0 | grad norm: 19622.537 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2494/ 292968 | consumed samples: 5107712 | consumed tokens: 657801216 | elapsed time per iteration (ms): 125307.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.728056E+00 | loss scale: 65536.0 | grad norm: 27467.263 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2495/ 292968 | consumed samples: 5109760 | consumed tokens: 658210816 | elapsed time per iteration (ms): 126630.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.719722E+00 | loss scale: 65536.0 | grad norm: 34198.566 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2496/ 292968 | consumed samples: 5111808 | consumed tokens: 658620416 | elapsed time per iteration (ms): 128610.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.718237E+00 | loss scale: 65536.0 | grad norm: 35073.791 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2497/ 292968 | consumed samples: 5113856 | consumed tokens: 659030016 | elapsed time per iteration (ms): 126763.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.691556E+00 | loss scale: 65536.0 | grad norm: 29139.525 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2498/ 292968 | consumed samples: 5115904 | consumed tokens: 659439616 | elapsed time per iteration (ms): 125641.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.724890E+00 | loss scale: 65536.0 | grad norm: 23439.934 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2499/ 292968 | consumed samples: 5117952 | consumed tokens: 659849216 | elapsed time per iteration (ms): 127715.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.716745E+00 | loss scale: 65536.0 | grad norm: 19943.741 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2500/ 292968 | consumed samples: 5120000 | consumed tokens: 660258816 | elapsed time per iteration (ms): 128601.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.712626E+00 | loss scale: 131072.0 | grad norm: 20295.690 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2501/ 292968 | consumed samples: 5122048 | consumed tokens: 660668416 | elapsed time per iteration (ms): 131174.5 | learning rate: 1.000E-04 | global 
batch size: 2048 | lm loss: 3.713100E+00 | loss scale: 131072.0 | grad norm: 36931.195 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2502/ 292968 | consumed samples: 5124096 | consumed tokens: 661078016 | elapsed time per iteration (ms): 131175.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.694582E+00 | loss scale: 131072.0 | grad norm: 49927.205 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2503/ 292968 | consumed samples: 5126144 | consumed tokens: 661487616 | elapsed time per iteration (ms): 129343.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.724925E+00 | loss scale: 131072.0 | grad norm: 60177.454 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2504/ 292968 | consumed samples: 5128192 | consumed tokens: 661897216 | elapsed time per iteration (ms): 127507.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.701005E+00 | loss scale: 131072.0 | grad norm: 50856.707 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2505/ 292968 | consumed samples: 5130240 | consumed tokens: 662306816 | elapsed time per iteration (ms): 129902.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.690974E+00 | loss scale: 131072.0 | grad norm: 53157.344 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2506/ 292968 | consumed samples: 5132288 | consumed tokens: 662716416 | elapsed time per iteration (ms): 128518.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.698850E+00 | loss scale: 131072.0 | grad norm: 54977.648 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2507/ 292968 | consumed samples: 5134336 | consumed tokens: 663126016 | elapsed time per iteration (ms): 127751.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.727875E+00 | loss scale: 131072.0 | grad norm: 59344.173 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2508/ 292968 | consumed samples: 5136384 | consumed tokens: 663535616 | elapsed time per iteration (ms): 130017.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.720293E+00 | loss scale: 131072.0 | grad norm: 45567.528 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2509/ 292968 | consumed samples: 5138432 | consumed tokens: 663945216 | elapsed time per iteration (ms): 131387.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.712830E+00 | loss scale: 131072.0 | grad norm: 41242.503 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2510/ 292968 | consumed samples: 5140480 | consumed tokens: 664354816 | elapsed time per iteration (ms): 131183.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711005E+00 | loss scale: 131072.0 | grad norm: 49437.526 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2511/ 292968 | consumed 
samples: 5142528 | consumed tokens: 664764416 | elapsed time per iteration (ms): 134500.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.709109E+00 | loss scale: 131072.0 | grad norm: 55609.251 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2512/ 292968 | consumed samples: 5144576 | consumed tokens: 665174016 | elapsed time per iteration (ms): 135374.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.734447E+00 | loss scale: 131072.0 | grad norm: 43249.036 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2513/ 292968 | consumed samples: 5146624 | consumed tokens: 665583616 | elapsed time per iteration (ms): 130829.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.699287E+00 | loss scale: 131072.0 | grad norm: 35654.330 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2514/ 292968 | consumed samples: 5148672 | consumed tokens: 665993216 | elapsed time per iteration (ms): 133991.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.718992E+00 | loss scale: 131072.0 | grad norm: 37759.592 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2515/ 292968 | consumed samples: 5150720 | consumed tokens: 666402816 | elapsed time per iteration (ms): 129624.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.719767E+00 | loss scale: 131072.0 | grad norm: 49193.514 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2516/ 292968 | consumed samples: 5152768 | consumed tokens: 666812416 | elapsed time per iteration (ms): 126860.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.729679E+00 | loss scale: 131072.0 | grad norm: 70559.533 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2517/ 292968 | consumed samples: 5154816 | consumed tokens: 667222016 | elapsed time per iteration (ms): 131649.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.677740E+00 | loss scale: 131072.0 | grad norm: 56023.485 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2518/ 292968 | consumed samples: 5156864 | consumed tokens: 667631616 | elapsed time per iteration (ms): 133698.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.696238E+00 | loss scale: 131072.0 | grad norm: 57083.392 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2519/ 292968 | consumed samples: 5158912 | consumed tokens: 668041216 | elapsed time per iteration (ms): 133130.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.712332E+00 | loss scale: 131072.0 | grad norm: 66522.220 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2520/ 292968 | consumed samples: 5160960 | consumed tokens: 668450816 | elapsed time per iteration (ms): 132776.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.699082E+00 | loss scale: 131072.0 | grad norm: 52981.553 | num zeros: 0.0 | curriculum 
seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2521/ 292968 | consumed samples: 5163008 | consumed tokens: 668860416 | elapsed time per iteration (ms): 133609.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.677561E+00 | loss scale: 131072.0 | grad norm: 49201.207 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2522/ 292968 | consumed samples: 5165056 | consumed tokens: 669270016 | elapsed time per iteration (ms): 134264.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.699126E+00 | loss scale: 131072.0 | grad norm: 38187.609 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2523/ 292968 | consumed samples: 5167104 | consumed tokens: 669679616 | elapsed time per iteration (ms): 133050.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711785E+00 | loss scale: 131072.0 | grad norm: 50523.507 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2524/ 292968 | consumed samples: 5169152 | consumed tokens: 670089216 | elapsed time per iteration (ms): 129836.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.695742E+00 | loss scale: 131072.0 | grad norm: 54330.129 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2525/ 292968 | consumed samples: 5171200 | consumed tokens: 670498816 | elapsed time per iteration (ms): 136356.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.732819E+00 | loss scale: 131072.0 | grad norm: 39968.544 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2526/ 292968 | consumed samples: 5173248 | consumed tokens: 670908416 | elapsed time per iteration (ms): 134571.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.712886E+00 | loss scale: 131072.0 | grad norm: 51363.977 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2527/ 292968 | consumed samples: 5175296 | consumed tokens: 671318016 | elapsed time per iteration (ms): 132047.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.695562E+00 | loss scale: 131072.0 | grad norm: 51765.676 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2528/ 292968 | consumed samples: 5177344 | consumed tokens: 671727616 | elapsed time per iteration (ms): 134158.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.707468E+00 | loss scale: 131072.0 | grad norm: 54323.308 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2529/ 292968 | consumed samples: 5179392 | consumed tokens: 672137216 | elapsed time per iteration (ms): 131022.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.695577E+00 | loss scale: 131072.0 | grad norm: 41546.541 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2530/ 292968 | consumed samples: 5181440 | consumed tokens: 672546816 | elapsed time per iteration (ms): 137329.4 | learning rate: 1.000E-04 
| global batch size: 2048 | lm loss: 3.701566E+00 | loss scale: 131072.0 | grad norm: 42285.909 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2531/ 292968 | consumed samples: 5183488 | consumed tokens: 672956416 | elapsed time per iteration (ms): 135951.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.682348E+00 | loss scale: 131072.0 | grad norm: 55894.421 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2532/ 292968 | consumed samples: 5185536 | consumed tokens: 673366016 | elapsed time per iteration (ms): 134684.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.719293E+00 | loss scale: 131072.0 | grad norm: 64429.092 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2533/ 292968 | consumed samples: 5187584 | consumed tokens: 673775616 | elapsed time per iteration (ms): 139215.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.718337E+00 | loss scale: 131072.0 | grad norm: 49058.682 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2534/ 292968 | consumed samples: 5189632 | consumed tokens: 674185216 | elapsed time per iteration (ms): 140178.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.713611E+00 | loss scale: 131072.0 | grad norm: 66713.209 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2535/ 292968 | consumed samples: 5191680 | consumed tokens: 674594816 | elapsed time per iteration (ms): 137068.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.720226E+00 | loss scale: 131072.0 | grad norm: 70072.153 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2536/ 292968 | consumed samples: 5193728 | consumed tokens: 675004416 | elapsed time per iteration (ms): 133750.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.686594E+00 | loss scale: 131072.0 | grad norm: 47463.962 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2537/ 292968 | consumed samples: 5195776 | consumed tokens: 675414016 | elapsed time per iteration (ms): 134502.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.718798E+00 | loss scale: 131072.0 | grad norm: 75553.129 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2538/ 292968 | consumed samples: 5197824 | consumed tokens: 675823616 | elapsed time per iteration (ms): 132873.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.676980E+00 | loss scale: 131072.0 | grad norm: 72938.459 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2539/ 292968 | consumed samples: 5199872 | consumed tokens: 676233216 | elapsed time per iteration (ms): 136842.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.690705E+00 | loss scale: 131072.0 | grad norm: 63805.103 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2540/ 292968 | 
consumed samples: 5201920 | consumed tokens: 676642816 | elapsed time per iteration (ms): 138332.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.699246E+00 | loss scale: 131072.0 | grad norm: 60131.574 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2541/ 292968 | consumed samples: 5203968 | consumed tokens: 677052416 | elapsed time per iteration (ms): 137209.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.687837E+00 | loss scale: 131072.0 | grad norm: 57555.686 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2542/ 292968 | consumed samples: 5206016 | consumed tokens: 677462016 | elapsed time per iteration (ms): 135834.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.714507E+00 | loss scale: 131072.0 | grad norm: 56971.731 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2543/ 292968 | consumed samples: 5208064 | consumed tokens: 677871616 | elapsed time per iteration (ms): 133073.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.731538E+00 | loss scale: 131072.0 | grad norm: 53881.397 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2544/ 292968 | consumed samples: 5210112 | consumed tokens: 678281216 | elapsed time per iteration (ms): 130908.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711176E+00 | loss scale: 131072.0 | grad norm: 46917.614 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2545/ 292968 | consumed samples: 5212160 | consumed tokens: 678690816 | elapsed time per iteration (ms): 130423.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.679188E+00 | loss scale: 131072.0 | grad norm: 44071.737 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2546/ 292968 | consumed samples: 5214208 | consumed tokens: 679100416 | elapsed time per iteration (ms): 138117.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.696059E+00 | loss scale: 131072.0 | grad norm: 52838.337 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2547/ 292968 | consumed samples: 5216256 | consumed tokens: 679510016 | elapsed time per iteration (ms): 134088.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.703378E+00 | loss scale: 131072.0 | grad norm: 60797.403 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2548/ 292968 | consumed samples: 5218304 | consumed tokens: 679919616 | elapsed time per iteration (ms): 134911.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.702210E+00 | loss scale: 131072.0 | grad norm: 50331.478 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) -saving checkpoint at iteration 2548 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -[2021-10-28 17:13:39,733] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/mp_rank_00_model_states.pt -[2021-10-28 17:13:39,941] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/mp_rank_01_model_states.pt -[2021-10-28 17:13:53,455] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_20_optim_states.pt -[2021-10-28 17:13:53,470] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_58_optim_states.pt -[2021-10-28 17:13:53,486] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-28 17:13:53,523] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-28 17:13:53,532] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-28 17:13:53,559] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-28 17:13:53,578] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-28 17:13:53,597] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-28 17:13:53,607] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_24_optim_states.pt -[2021-10-28 17:13:53,662] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_38_optim_states.pt -[2021-10-28 17:13:53,665] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-28 17:13:53,672] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-28 17:13:53,688] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-28 17:13:53,718] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-28 17:13:53,753] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_22_optim_states.pt 
-[2021-10-28 17:13:53,787] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-28 17:13:53,798] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-28 17:13:53,808] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_10_optim_states.pt -[2021-10-28 17:13:53,817] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-28 17:13:53,876] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-28 17:13:53,882] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_46_optim_states.pt -[2021-10-28 17:13:53,882] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-28 17:13:53,884] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-28 17:13:53,929] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-28 17:13:53,954] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-28 17:13:53,966] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-28 17:13:54,034] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-28 17:13:54,043] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-28 17:13:54,206] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-28 17:13:54,207] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-28 17:13:54,220] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-28 17:13:54,542] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-28 17:13:54,564] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-28 17:13:54,565] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-28 17:13:54,589] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_28_optim_states.pt -[2021-10-28 17:13:54,591] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-28 17:13:54,600] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-28 17:13:54,615] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_14_optim_states.pt -[2021-10-28 17:13:54,620] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_107_optim_states.pt -[2021-10-28 17:13:54,624] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-28 17:13:54,631] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-28 17:13:54,645] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-28 17:13:54,650] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_57_optim_states.pt -[2021-10-28 17:13:54,679] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_27_optim_states.pt -[2021-10-28 17:13:54,679] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_23_optim_states.pt -[2021-10-28 17:13:54,681] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-28 17:13:54,689] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_84_optim_states.pt -[2021-10-28 17:13:54,690] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-28 17:13:54,698] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-28 17:13:54,707] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-28 17:13:54,717] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-28 17:13:54,725] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-28 17:13:54,780] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-28 17:13:54,790] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_97_optim_states.pt -[2021-10-28 17:13:54,796] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-28 17:13:54,804] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-28 17:13:54,819] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_11_optim_states.pt -[2021-10-28 17:13:54,848] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_06_optim_states.pt -[2021-10-28 17:13:54,859] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-28 17:13:54,874] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-28 17:13:54,895] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_21_optim_states.pt -[2021-10-28 17:13:54,899] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-28 17:13:54,914] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-28 17:13:54,916] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-28 17:13:54,941] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_111_optim_states.pt -[2021-10-28 17:13:54,965] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_79_optim_states.pt -[2021-10-28 17:13:54,974] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-28 17:13:54,975] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-28 17:13:54,978] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-28 17:13:54,983] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_87_optim_states.pt -[2021-10-28 17:13:55,002] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-28 17:13:55,008] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_08_optim_states.pt -[2021-10-28 17:13:55,015] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-28 17:13:55,030] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-28 17:13:55,034] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-28 17:13:55,047] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-28 17:13:55,079] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_106_optim_states.pt -[2021-10-28 17:13:55,080] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_117_optim_states.pt -[2021-10-28 17:13:55,092] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-28 17:13:55,098] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-28 17:13:55,101] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-28 17:13:55,127] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-28 17:13:55,144] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-28 17:13:55,160] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-28 17:13:55,248] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-28 17:13:55,250] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-28 17:13:55,252] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-28 17:13:55,264] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-28 17:13:55,392] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-28 17:13:55,441] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-28 17:13:55,464] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-28 17:13:55,506] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_82_optim_states.pt -[2021-10-28 17:13:55,520] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-28 17:13:55,559] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_118_optim_states.pt -[2021-10-28 17:13:55,563] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-28 17:13:55,590] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_115_optim_states.pt -[2021-10-28 17:13:55,596] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-28 17:13:55,603] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-28 17:13:55,668] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-28 17:13:55,685] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-28 17:13:55,693] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_120_optim_states.pt -[2021-10-28 17:13:55,749] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-28 17:13:55,789] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-28 17:13:55,801] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-28 17:13:56,296] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_00_optim_states.pt -[2021-10-28 17:13:56,445] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-28 17:13:56,612] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-28 17:13:56,740] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-28 17:13:57,647] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_07_optim_states.pt -[2021-10-28 17:13:57,658] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-28 17:13:57,909] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_126_optim_states.pt -[2021-10-28 17:13:57,967] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_05_optim_states.pt -[2021-10-28 17:13:58,010] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-28 17:13:58,032] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-28 17:14:00,209] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-28 17:14:00,309] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-28 17:14:00,312] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-28 17:14:00,743] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-28 17:14:00,748] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-28 17:14:01,156] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-28 17:14:01,205] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-28 17:14:02,800] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-28 17:14:03,558] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-28 17:14:04,345] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-28 17:14:04,864] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-28 17:14:05,058] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-28 17:14:05,492] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-28 17:14:06,165] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2548/zero_pp_rank_0_mp_rank_72_optim_states.pt
- successfully saved checkpoint at iteration 2548 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 29252.10
-[exiting program after 1190.94853798151 minutes] datetime: 2021-10-28 17:14:06
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
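The per-iteration accounting above is internally consistent: each iteration consumes global batch size x curriculum seqlen = 2048 x 200 = 409,600 tokens, which matches the consumed-token deltas (e.g. 677,052,416 - 676,642,816), and at ~135 s per iteration that is roughly 3k tokens/s end to end. The ~29 s checkpoint save writes one mp_rank_*_model_states.pt per model-parallel rank plus one zero_pp_rank_*_mp_rank_*_optim_states.pt per (ZeRO, model-parallel) rank pair, as the lines above show. A minimal sketch of the arithmetic, with illustrative variable names and values copied from the log:

    # Sanity check of the iteration accounting in the log above.
    global_batch_size = 2048
    curriculum_seqlen = 200   # curriculum learning caps the sequence length at 200 here
    tokens_per_iter = global_batch_size * curriculum_seqlen
    assert tokens_per_iter == 677052416 - 676642816 == 409600  # consumed-token delta
    assert 5203968 - 5201920 == global_batch_size              # consumed-sample delta
    iter_time_s = 135.8  # typical "elapsed time per iteration (ms)" / 1000
    print(f"~{tokens_per_iter / iter_time_s:,.0f} tokens/s")   # ~3,016 tokens/s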
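On restart, every launched rank prints the DeepSpeed extension op report that follows; [NO] in the "installed" column means the op was not pre-built and will be JIT-compiled with ninja on first use, while "compatible" [OKAY] means the system can build it. The same information can be queried programmatically; a sketch, assuming builder classes as shipped in 2021-era DeepSpeed:

    # Probe whether DeepSpeed ops can be JIT-built, roughly what the report checks.
    from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder

    for builder in (CPUAdamBuilder(), FusedAdamBuilder()):
        print(builder.name, "compatible:", builder.is_compatible())
    # builder.load() would trigger the ninja JIT build of the extension.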
-***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report --------------------------------------------------- - -----------------------------------------------------------------------------------------------------JIT compiled ops requires ninja - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ---------------------------------------------------DeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - - ---------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja .................................... .................. [OKAY]..................[OKAY] - [OKAY] ---------------------------------------------------[OKAY] - --------------------------------------------------- --------------------------------------------------- -op name ---------------------------------------------------op name -op name ................................op name................ installed installed ................ installed.. .. installed ..compatible compatible -.. -compatible-------------------------------------------------- -------------------------------------------------- -compatible --------------------------------------------------- - --------------------------------------------------- - -cpu_adam cpu_adam............... 
cpu_adam cpu_adam............... [NO] [NO]............... ............... .......[NO]....... [NO] ....... [OKAY][OKAY] -....... - [OKAY][OKAY] - -fused_adamfused_adam .......................... fused_adam [NO] [NO] fused_adam............. .............. [OKAY][OKAY] - ............. -[NO]fused_lamb [NO]....... .............fused_lamb ....... [OKAY] [NO] -............. .......[NO] fused_lamb [OKAY][OKAY] ....... - -............. [OKAY][NO] - ....... [OKAY] -fused_lamb ............. [NO] .......sparse_attnsparse_attn ........................ sparse_attn[OKAY][NO] [NO] -............ ..............[NO] [OKAY][OKAY]....... - - [OKAY] -transformertransformer ........................transformer [NO][NO]............ ..............[NO] [OKAY][OKAY]....... - -[OKAY] -stochastic_transformersparse_attnstochastic_transformerstochastic_transformer .............. . [NO][NO][NO] [NO] .............. ....... .......[OKAY] [OKAY] [OKAY] - -[OKAY] - -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op nameop name op name op name................ ................ ................................installed installedinstalledinstalled.. ....compatible.. compatible -compatible - ---------------------------------------------------compatible-------------------------------------------------- --------------------------------------------------- - - --------------------------------------------------- -cpu_adamcpu_adam cpu_adamcpu_adam............... ..............................[NO]............... [NO][NO][NO]....... .......[OKAY].............. - [OKAY][OKAY][OKAY] - - -fused_adam ............. [NO]fused_adamfused_adamfused_adam .............................................. [OKAY][NO] -[NO] [NO] ....... ....... .......fused_lamb [OKAY] [OKAY] -[OKAY]............. - - [NO] fused_lamb....... fused_lamb fused_lamb .............[OKAY] ............. -.............[NO] [NO][NO]....... ..............[OKAY] -[OKAY][OKAY] - -sparse_attn ............ [NO] .......sparse_attn sparse_attn sparse_attn[OKAY]........................ - ............[NO][NO] transformer[NO].............. ...................[OKAY][OKAY] -[OKAY] -[NO] -transformer .......transformertransformer............ ............[OKAY] [NO]............ -[NO] .......[NO]....... stochastic_transformer[OKAY]....... -[OKAY][OKAY]. - - [NO]stochastic_transformer .......stochastic_transformerstochastic_transformer [OKAY]. -.. [NO][NO] [NO] ....... ....... ....... [OKAY] [OKAY] -[OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... 
[OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op name op name................ ................installed ..installed compatible.. - --------------------------------------------------compatible - --------------------------------------------------- -cpu_adam ...............cpu_adam [NO] ...................... [NO][OKAY] ....... [OKAY] - -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO]fused_adam ....... [OKAY] -............. [NO] .......sparse_attn [OKAY]............ - [NO] ....... [OKAY] -transformer ............ [NO]fused_lamb ....... [OKAY] -stochastic_transformer............. [NO] . [NO] ....... [OKAY] -....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
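The report above notes that JIT-compiled ops require ninja. A minimal sketch (standard library only; the label strings are ours) for verifying that prerequisite from Python before launching:

    import shutil

    # DeepSpeed JIT-compiles missing ops at runtime, which needs the
    # ninja build tool on PATH; this mirrors the "ninja ... [OKAY]" probe.
    ninja_path = shutil.which("ninja")
    print("ninja:", ninja_path or "not found -- JIT ops cannot build")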
- [WARNING] async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING] async_io: please install the libaio-devel package with yum
- [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
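The warning above suggests pointing the build at a libaio install via CFLAGS and LDFLAGS. A hedged sketch of doing so from Python before DeepSpeed JIT-compiles the async_io op; the /opt/libaio prefix is hypothetical and stands in for the actual install location:

    import os

    # Hypothetical prefix -- replace with wherever libaio was installed.
    LIBAIO_PREFIX = "/opt/libaio"
    os.environ["CFLAGS"] = f"-I{LIBAIO_PREFIX}/include"
    os.environ["LDFLAGS"] = f"-L{LIBAIO_PREFIX}/lib"

    # Import after setting the env so the JIT build can see the flags.
    import deepspeed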
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
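This environment block is the same information DeepSpeed's ds_report utility prints. A minimal sketch reproducing the key fields from public attributes, as a sanity check against the versions reported in this run:

    import torch
    import deepspeed

    # Mirrors the fields under "DeepSpeed general environment info:".
    print("torch install path ...", list(torch.__path__))
    print("torch version ........", torch.__version__)      # 1.8.1 in this run
    print("torch cuda version ...", torch.version.cuda)     # 11.1 in this run
    print("deepspeed info .......", deepspeed.__version__)  # 0.5.5+29bee73 here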
[OKAY][OKAY][OKAY][OKAY] - - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- - -op name - [NO] [NO] ...................transformer ...................[OKAY][NO] - op nameop name op name ................................ ................................ installedinstalled installedinstalled.... .. compatiblecompatible - [NO].......[OKAY] transformer -.......[OKAY] -............[OKAY]transformer - .. --------------------------------------------------- compatible--------------------------------------------------compatible - - - ----------------------------------------------------------------------------------------------------- - -[NO] stochastic_transformer ............ ........stochastic_transformer[NO] [OKAY][NO]........ - [NO].......[OKAY] stochastic_transformer ....... - [OKAY] -[OKAY]. -cpu_adam ...............cpu_adam cpu_adam [NO]cpu_adam ............... ..................................... [NO] [NO].......[NO][OKAY] -stochastic_transformer [NO] ........ [OKAY][NO] - ....... [OKAY] - .......[OKAY]....... - [OKAY][OKAY] - -fused_adam ............. [NO] .......fused_adam [OKAY]............. - fused_adamfused_adam[NO]fused_lamb ....................................... ....... [NO] [NO][OKAY][NO] - ..................... [OKAY]fused_lamb -[OKAY] [OKAY] -............. - [NO] .......fused_lamb fused_lamb[OKAY]............. - .............[NO] sparse_attn[NO] .......................... [OKAY][NO][OKAY] -....... - sparse_attn[OKAY] -............ [NO]transformer ................... [OKAY][NO] - ....... transformer[OKAY] sparse_attn............sparse_attn - [NO]............ ............stochastic_transformer [NO]........ [NO][OKAY][NO]....... ....... - [OKAY] ....... -stochastic_transformer[OKAY] -[OKAY] -.transformer transformer [NO] ............ ............ ....... [NO] [NO] [OKAY] ....... -....... [OKAY][OKAY] - -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
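The dump above (environment info plus the op-compatibility table) is DeepSpeed's standard self-report; every rank prints the same thing, which is why the raw log repeats it many times over. A minimal sketch for regenerating it on one node, assuming the deepspeed.env_report module of DeepSpeed 0.5.x exposes main() (the installed ds_report console script wraps the same code):

# Reprint the environment and op-compatibility report shown above.
# Assumption: deepspeed.env_report.main() exists, as in DeepSpeed 0.5.x;
# if the module path differs in your install, run the `ds_report`
# console script instead.
import deepspeed.env_report

deepspeed.env_report.main()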
torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report --------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report--------------------------------------------------DeepSpeed C++/CUDA extension op report - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - -JIT compiled ops requires ninja -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -ninjaninjaninja ninja .................. .................. .................. ..................[OKAY][OKAY][OKAY] - - -[OKAY]------------------------------------------------------------------------------------------------------------------------------------------------------ - - - ---------------------------------------------------op nameop nameop name - ................................................op name installedinstalledinstalled................ ......installed compatiblecompatible..compatible - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -compatible --------------------------------------------------- -cpu_adam cpu_adamcpu_adam............... ..............................[NO] [NO][NO]....... cpu_adam..............[OKAY] - [OKAY][OKAY]............... - -[NO] fused_adam....... fused_adam[OKAY].............fused_adam .............[NO] ............. - [NO] ....... [NO] .......[OKAY]....... - [OKAY][OKAY] - -fused_lamb .............fused_lamb fused_lamb[NO] fused_adam............. ............. ....................[NO][NO] [OKAY][NO].............. - .......[OKAY] [OKAY] -[OKAY] - -fused_lamb ............. [NO] sparse_attnsparse_attn ............sparse_attn................... ............[NO] [NO]....... [OKAY][NO] ....... -[OKAY] -.......[OKAY] -[OKAY]transformer - transformer............ transformer............[NO] ............ .......[NO][NO] [OKAY].............. 
- sparse_attn [OKAY] [OKAY] -............ -stochastic_transformer stochastic_transformer. stochastic_transformer [NO][NO] ......... [NO] [NO] ....... [OKAY]....... ....... - [OKAY][OKAY][OKAY] - - -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inference transformer_inference.. [NO].. .......[NO] [OKAY]....... - [OKAY] -utils ..................utils [NO].................. .......[NO] [OKAY]....... - [OKAY] -quantizer .............. [NO] quantizer....... ..............[OKAY] -[NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference ..utils [NO].................. .......[NO] .......[OKAY] -[OKAY] -utils quantizer.................. ..............[NO] [NO]....... .......[OKAY] -[OKAY] -quantizer --------------------------------------------------.............. - [NO] ....... 
[OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -DeepSpeed C++/CUDA extension op report--------------------------------------------------DeepSpeed C++/CUDA extension op report - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja .................. ...................................................... [OKAY][OKAY] [OKAY] - - -[OKAY]-------------------------------------------------- - -----------------------------------------------------------------------------------------------------op name - ---------------------------------------------------................op nameop name - installed................op name................ ..installed................ installedcompatible.. installed - .. --------------------------------------------------compatible - - ..--------------------------------------------------compatible - -compatible --------------------------------------------------- --------------------------------------------------- -cpu_adam ............... [NO]cpu_adam ......................cpu_adam [NO] cpu_adam[OKAY] -..................................... [OKAY][NO][NO] - .............. [OKAY] -[OKAY] -fused_adam ............. fused_adam[NO] .................... fused_adam[OKAY][NO]fused_adam - ................................. [NO][NO]fused_lamb[OKAY] -........................... fused_lamb[NO][OKAY] [OKAY] -.................... - [NO]fused_lamb[OKAY] -fused_lamb ....... ............. ............. [OKAY] [NO] -[NO] .............. [OKAY][OKAY] - -sparse_attn ............ [NO] .......sparse_attn [OKAY]............ - [NO] sparse_attnsparse_attntransformer....... ........................[OKAY]............ [NO] -[NO][NO] .......transformer....... ....... [OKAY]............[OKAY] - -[OKAY][NO] -transformer....... ............transformerstochastic_transformer[OKAY] -............[NO]. stochastic_transformer[NO][NO]....... ............... [OKAY][OKAY][OKAY][NO] - - -....... [OKAY]stochastic_transformer - stochastic_transformer . .[NO] [NO]....... 
.......[OKAY] -[OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.async_io - ............... [NO] ....... [NO] -async_io ............... transformer_inference[NO] ......... [NO][NO] -....... [OKAY] -utils .................. [NO] ....... [OKAY]transformer_inference - .. [NO] .......quantizer [OKAY].............. - [NO] ....... utils[OKAY] -.................. [NO] ....... [OKAY]-------------------------------------------------- - -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -utils utils.................. ..................[NO] [NO]....... .......[OKAY] -[OKAY] -quantizer quantizer.............. ..............[NO] [NO]....... .......[OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... async_io[NO]async_io ....... ............... ............... [NO] [NO] -[NO] .............. [NO][NO] - -transformer_inference .. [NO] ....... [OKAY]transformer_inferencetransformer_inference - .... [NO]utils[NO] ................................ [NO][OKAY][OKAY] - -....... [OKAY] -utils ..................utils quantizer[NO].................. .....................[NO] [NO]....... [OKAY] ....... -[OKAY] -[OKAY] -quantizerquantizer ..............--------------------------------------------------.............. - [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. 
[NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -DeepSpeed general environment info: -async_io ...............async_io [NO] ...................... [NO] -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -[NO] ....... [NO] -torch version .................... 1.8.1 -transformer_inference .. [NO] .......transformer_inference [OKAY].. -torch cuda version ............... 11.1 - [NO] ....... [OKAY]utils -nvcc version ..................... 11.2 - .................. [NO] ....... utils[OKAY] -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -.................. [NO] quantizer....... ..............[OKAY] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -[NO] ....... [OKAY]quantizer - .............. [NO] --------------------------------------------------....... - [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -[NO] ....... [OKAY] --------------------------------------------------- -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.5.5+29bee73, 29bee73, master0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op report--------------------------------------------------DeepSpeed C++/CUDA extension op report - - --------------------------------------------------- -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - - - ---------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -ninjaninjaninjaninja ...................................................... .................. 
[OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - - -op nameop nameop name ................op name................ ................installedinstalled................ installed..installed.. ..compatiblecompatible .. - - compatible---------------------------------------------------------------------------------------------------- -compatible - --------------------------------------------------- - --------------------------------------------------- -cpu_adamcpu_adam cpu_adam...............cpu_adam ............... ............... [NO]...............[NO] [NO]....... ....... [NO] .......[OKAY][OKAY] - -.......[OKAY] -[OKAY] -fused_adamfused_adam fused_adam ............. fused_adam.............[NO]............. ....................[NO][NO] [OKAY][NO].............. - .......[OKAY] -[OKAY][OKAY]fused_lamb - - fused_lamb............. fused_lambfused_lamb............. [NO] .............[NO] ............. .......[NO].......[NO] [OKAY][OKAY]....... -....... - [OKAY][OKAY] - -sparse_attn sparse_attn............ ............[NO]sparse_attn sparse_attn [NO]....... ............ ...................[OKAY][NO] - [OKAY][NO].......transformer - ....... [OKAY]transformer ............ - ............ [OKAY] [NO]transformer[NO] - .......................... transformer[OKAY][OKAY][NO] - - ................... [NO][OKAY]stochastic_transformer -stochastic_transformer ....... .stochastic_transformer[OKAY]. -[NO][NO]. stochastic_transformer .............. [NO] [OKAY][OKAY] - -........ [NO][OKAY] - ....... [OKAY] - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ******** Git info for Megatron: git_hash=bdc6ad6 git_branch=main **** - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
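-The async_io probe above fails because the libaio development files are missing on the compute nodes. Per the warning, the fix is either installing libaio-devel or pointing the compiler at a from-source libaio via CFLAGS/LDFLAGS before the op is (JIT-)built. A sketch under those assumptions; the install prefix below is hypothetical, and the AsyncIOBuilder name and env-var handling are assumed from the builder API rather than confirmed by this log:
-import os
-from deepspeed.ops.op_builder import AsyncIOBuilder
-libaio_prefix = "/path/to/libaio"  # hypothetical from-source install prefix
-os.environ["CFLAGS"] = "-I" + os.path.join(libaio_prefix, "include")
-os.environ["LDFLAGS"] = "-L" + os.path.join(libaio_prefix, "lib")
-if AsyncIOBuilder().is_compatible():
-    AsyncIOBuilder().load()  # JIT-build async_io against the libaio above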
- [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
- [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -> setting tensorboard ... - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -async_io-------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
-------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1762246.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 150
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 1190
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.006
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 0.0001
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 3750000
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes ..................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
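As a sanity check on the arguments above: the parallelism degrees must multiply to the world size, and the constant micro-batch count follows from the batch-size settings. A small illustrative check in plain Python, with all values copied from the dump:

```python
# Values copied from the arguments dump above.
world_size = 128
tensor_mp, pipeline_mp = 4, 32
data_parallel = world_size // (tensor_mp * pipeline_mp)  # 128 // 128 = 1
assert data_parallel == 1  # matches data_parallel_size

global_batch, micro_batch = 2048, 1
num_micro_batches = global_batch // (micro_batch * data_parallel)
assert num_micro_batches == 2048  # "setting number of micro-batches to constant 2048"
```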
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
- [WARNING]  async_io: please install the libaio-devel package with yum
- [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-29 13:52:48,393] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.306 seconds
-> compiling and loading fused kernels ...
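[Editor's note] The "> padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)" line above follows from two flags in the argument dump: make_vocab_size_divisible_by=128 and tensor_model_parallel_size=4. A minimal sketch of that round-up, assuming Megatron's usual rule of padding to a multiple of make_vocab_size_divisible_by times the tensor-parallel size so every embedding shard is the same size:

    def padded_vocab_size(orig_vocab_size, make_vocab_size_divisible_by, tp_size):
        # Round the vocabulary up to the next multiple of 128 * 4 = 512.
        multiple = make_vocab_size_divisible_by * tp_size
        return ((orig_vocab_size + multiple - 1) // multiple) * multiple

    padded = padded_vocab_size(50257, 128, 4)
    assert padded == 50688          # "new size" in the log line above
    assert padded - 50257 == 431    # the 431 dummy tokens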
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
- !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
- !! WARNING !!
-
- warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
->>> done with compiling and loading fused kernels. Compilation time: 18.324 seconds
-time to initialize megatron (seconds): 35.090
-[after megatron is initialized] datetime: 2021-10-29 13:53:07
-building GPT model ...
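[Editor's note] Two things worth decoding in the model-building output that follows: the topology dump maps each ProcessCoord(pipe, data, model) to a global rank, and every rank on the 30 middle pipeline stages reports 807539800 parameters. The sketch below is an annotation, not the training code. hidden_size is not visible in the argument dump; it is assumed here to be num_attention_heads * kv_channels = 80 * 145 = 11600 (the default Megatron relationship), and the bias/LayerNorm accounting is one consistent way to reproduce the logged counts.

    # Editor's sketch of the topology and per-rank parameter counts below.
    TP, PP, DP = 4, 32, 1

    def global_rank(pipe, data, model, tp=TP, dp=DP):
        # The topology enumerates the tensor ("model") axis fastest, then
        # data, then pipe, e.g. ProcessCoord(pipe=18, data=0, model=1) -> 73.
        return (pipe * dp + data) * tp + model

    assert global_rank(pipe=18, data=0, model=1) == 73
    assert global_rank(pipe=31, data=0, model=3) == 127

    h = 80 * 145                   # assumed hidden size, 11600 (see note above)
    layers_per_stage = 64 // PP    # 2 transformer layers per middle stage

    # Per transformer layer: the 12*h^2 matmul weights (QKV, attention output,
    # two MLP projections) are sharded across TP ranks; column-parallel biases
    # are sharded too, while row-parallel biases and the two LayerNorms are
    # replicated on every TP rank.
    weights_local = 12 * h * h // TP          # sharded matmul weights
    sharded_bias  = (3 * h + 4 * h) // TP     # QKV and first-MLP biases
    replicated    = h + h + 4 * h             # two output biases, 2 LayerNorms
    per_layer_local = weights_local + sharded_bias + replicated

    assert layers_per_stage * per_layer_local == 807_539_800  # middle stages

    # First/last stages: EmbeddingPipe adds a padded-vocab shard (50688 from
    # the log above) plus replicated position embeddings; the last stage also
    # owns the final MixedFusedLayerNorm (weight + bias).
    word_emb_local = 50688 // TP * h
    pos_emb = 2048 * h
    first_stage = layers_per_stage * per_layer_local + word_emb_local + pos_emb
    last_stage = first_stage + 2 * h
    assert first_stage == 978_291_800   # ranks on pipeline stage 0
    assert last_stage == 978_315_000    # ranks on pipeline stage 31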
-[2021-10-29 13:53:07,156] [INFO] [utils.py:806:see_memory_usage] Before Building Model -[2021-10-29 13:53:07,156] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB -[2021-10-29 13:53:07,157] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.12 GB, percent = 22.0% -SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None -Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, 
ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127} -[2021-10-29 13:53:08,829] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer -stage=0 layers=5 - 0: _to_float16 - 1: EmbeddingPipe - 2: - 3: ParallelTransformerLayerPipe - 4: ParallelTransformerLayerPipe -stage=1 layers=2 - 5: ParallelTransformerLayerPipe - 6: ParallelTransformerLayerPipe -stage=2 layers=2 - 7: ParallelTransformerLayerPipe - 8: ParallelTransformerLayerPipe -stage=3 layers=2 - 9: ParallelTransformerLayerPipe - 10: ParallelTransformerLayerPipe -stage=4 layers=2 - 11: ParallelTransformerLayerPipe - 12: ParallelTransformerLayerPipe -stage=5 layers=2 - 13: ParallelTransformerLayerPipe - 14: ParallelTransformerLayerPipe -stage=6 layers=2 - 15: ParallelTransformerLayerPipe - 16: ParallelTransformerLayerPipe -stage=7 layers=2 - 17: ParallelTransformerLayerPipe - 18: ParallelTransformerLayerPipe -stage=8 layers=2 - 19: ParallelTransformerLayerPipe - 20: ParallelTransformerLayerPipe -stage=9 layers=2 - 21: ParallelTransformerLayerPipe - 22: ParallelTransformerLayerPipe -stage=10 layers=2 - 23: ParallelTransformerLayerPipe - 24: 
ParallelTransformerLayerPipe -stage=11 layers=2 - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=12 layers=2 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe -stage=13 layers=2 - 29: ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=14 layers=2 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe -stage=15 layers=2 - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe -stage=16 layers=2 - 35: ParallelTransformerLayerPipe - 36: ParallelTransformerLayerPipe -stage=17 layers=2 - 37: ParallelTransformerLayerPipe - 38: ParallelTransformerLayerPipe -stage=18 layers=2 - 39: ParallelTransformerLayerPipe - 40: ParallelTransformerLayerPipe -stage=19 layers=2 - 41: ParallelTransformerLayerPipe - 42: ParallelTransformerLayerPipe -stage=20 layers=2 - 43: ParallelTransformerLayerPipe - 44: ParallelTransformerLayerPipe -stage=21 layers=2 - 45: ParallelTransformerLayerPipe - 46: ParallelTransformerLayerPipe -stage=22 layers=2 - 47: ParallelTransformerLayerPipe - 48: ParallelTransformerLayerPipe -stage=23 layers=2 - 49: ParallelTransformerLayerPipe - 50: ParallelTransformerLayerPipe -stage=24 layers=2 - 51: ParallelTransformerLayerPipe - 52: ParallelTransformerLayerPipe -stage=25 layers=2 - 53: ParallelTransformerLayerPipe - 54: ParallelTransformerLayerPipe -stage=26 layers=2 - 55: ParallelTransformerLayerPipe - 56: ParallelTransformerLayerPipe -stage=27 layers=2 - 57: ParallelTransformerLayerPipe - 58: ParallelTransformerLayerPipe -stage=28 layers=2 - 59: ParallelTransformerLayerPipe - 60: ParallelTransformerLayerPipe -stage=29 layers=2 - 61: ParallelTransformerLayerPipe - 62: ParallelTransformerLayerPipe -stage=30 layers=2 - 63: ParallelTransformerLayerPipe - 64: ParallelTransformerLayerPipe -stage=31 layers=6 - 65: ParallelTransformerLayerPipe - 66: ParallelTransformerLayerPipe - 67: - 68: MixedFusedLayerNorm - 69: EmbeddingPipe - 70: float16_to_fp32 - loss: CrossEntropy - > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800 - > number of 
parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800 - > number of parameters on (tensor, pipeline) 
model parallel rank (1, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800 - - > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800 > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800 - - > 
number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800 - > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800 - > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000 - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800 -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - - - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... (one such line per rank)
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
-[2021-10-29 13:53:09,528] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-29 13:53:09,528] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-29 13:53:09,528] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.29 GB, percent = 22.1%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-29 13:53:09,529] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-[2021-10-29 13:53:09,566] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-29 13:53:09,566] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-29 13:53:09,566] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-29 13:53:09,567] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-29 13:53:09,567] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'>
-[2021-10-29 13:53:09,567] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-29 13:53:09,567] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-29 13:53:09,567] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-29 13:53:09,567] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-29 13:53:09,567] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
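The optimizer lines above map one-to-one onto fields of the DeepSpeed config. A minimal sketch of a config that would produce this combination (client FusedAdam wrapped as the basic optimizer, fp16, ZeRO stage 1, 500M buckets, no CPU offload); the model, learning rate, and batch size are placeholders, and a distributed launcher providing the usual rank environment is assumed:

    import torch
    import deepspeed
    from deepspeed.ops.adam import FusedAdam

    model = torch.nn.Linear(1024, 1024).cuda()          # stand-in for the real pipeline-parallel model
    optimizer = FusedAdam(model.parameters(), lr=1e-4)  # "Using client Optimizer as basic optimizer"; lr is illustrative

    ds_config = {
        "train_micro_batch_size_per_gpu": 1,    # required field; the value here is an assumption
        "fp16": {"enabled": True},              # -> "Creating fp16 ZeRO stage 1 optimizer"
        "zero_optimization": {
            "stage": 1,                         # -> ZeRO stage 1
            "reduce_bucket_size": 500000000,    # -> "Reduce bucket size 500000000"
            "allgather_bucket_size": 500000000, # -> "Allgather bucket size 500000000"
            "cpu_offload": False,               # -> "CPU Offload: False"
        },
    }

    # DeepSpeed 0.5.x takes the dict via config_params; newer releases also accept config=.
    engine, optimizer, _, _ = deepspeed.initialize(
        model=model, optimizer=optimizer, config_params=ds_config)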
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler PyTorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja...
-Building extension module utils...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module utils... (one such line per rank)
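The compiler warning is cosmetic here (ninja reports "no work to do", so the cached build is reused), but the fix it asks for amounts to pointing the extension build at g++: torch.utils.cpp_extension picks its C++ compiler from the CXX environment variable and falls back to plain c++, which is what triggers the message on this system. A sketch of the workaround, assuming it runs before the first JIT build (in the job script or at the top of the training entry point):

    # Must be set before any extension is compiled; torch.utils.cpp_extension
    # reads os.environ["CXX"] (default "c++") when it builds, so selecting g++
    # silences the ABI-compatibility warning above.
    import os
    os.environ["CXX"] = "g++"

    import torch.utils.cpp_extension  # subsequent JIT builds now use g++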
-Time to load utils op: 1.37 to 1.51 seconds (first load; one such line per rank)
-Rank 0-3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank 4-123 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank 124-127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
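The two group sizes per rank are consistent with the per-rank parameter counts logged earlier; a quick arithmetic check, with the values copied from this log:

    # Each rank reports two parameter groups; their sizes should sum to the
    # "number of parameters on (tensor, pipeline) model parallel rank ..."
    # counts printed above.
    first_stage = 978_112_000 + 179_800   # ranks 0-3 (first pipeline stage)
    middle      = 807_360_000 + 179_800   # ranks 4-123
    last_stage  = 978_112_000 + 203_000   # ranks 124-127 (last pipeline stage)

    assert first_stage == 978_291_800     # matches rank (0, 0)
    assert last_stage  == 978_315_000     # matches rank (1, 31)
    print(first_stage, middle, last_stage)  # 978291800 807539800 978315000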
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0010 to 0.0023 seconds (cached re-load; one such sequence per rank)
-[2021-10-29 13:53:13,617] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-29 13:53:13,617] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-29 13:53:13,618] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.31 GB, percent = 22.1%
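The jump from 1.88 GB allocated after building the model to 5.47 GB before initializing optimizer states is consistent with the fp32 master copy of the weights that the fp16 ZeRO-1 optimizer keeps alongside the fp16 model weights; checking against rank 0's parameter count from this log:

    GiB = 1024 ** 3
    params_rank0 = 978_291_800             # "number of parameters on ... rank (0, 0)"

    fp16_weights = 2 * params_rank0 / GiB  # model weights held in fp16
    fp32_master  = 4 * params_rank0 / GiB  # fp32 master copy created by the ZeRO-1 optimizer

    print(round(fp16_weights, 2))                # ~1.82, close to the 1.88 GB "After Building Model"
    print(round(fp16_weights + fp32_master, 2))  # 5.47, matching "Before initializing optimizer states"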
-No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... - -Loading extension module utils...Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Loading extension module utils... -Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... - - -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Time to load utils op: 0.0013861656188964844 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -Time to load utils op: 0.0012545585632324219 seconds -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... - -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.0010418891906738281 secondsTime to load utils op: 0.0011408329010009766 seconds - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -No modifications detected for re-loaded extension module utils, skipping build step... -Time to load utils op: 0.00110626220703125 secondsTime to load utils op: 0.0010571479797363281 seconds - -Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... 
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.0010166168212890625 seconds
[… the same four extension-loading messages repeat, interleaved across stdout, for the remaining ranks; all reported load times fall between ~0.0008 and ~0.0015 seconds …]
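The messages above are DeepSpeed JIT-loading its fused C++ helper op through PyTorch's extension loader: the first process to need the op compiles it under the extensions root, and every later process finds the cached build, hence "No modifications detected ... skipping build step" and the ~1 ms load times. A minimal sketch of the mechanism, assuming a DeepSpeed version of this era (the cache path below is illustrative, not the cluster's real one):

    import os

    # TORCH_EXTENSIONS_DIR is the knob behind "Using ... as PyTorch extensions root".
    os.environ.setdefault("TORCH_EXTENSIONS_DIR",
                          os.path.expanduser("~/.cache/torch_extensions"))  # illustrative path

    from deepspeed.ops.op_builder import UtilsBuilder

    # load() compiles the op on first use (printing "Time to load utils op: ...")
    # and reuses the cached build afterwards ("skipping build step").
    utils_op = UtilsBuilder().load()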
-[2021-10-29 13:53:13,667] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-29 13:53:13,668] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-29 13:53:13,668] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.31 GB, percent = 22.1%
-[2021-10-29 13:53:13,668] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-29 13:53:13,700] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-29 13:53:13,700] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-29 13:53:13,701] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.31 GB, percent = 22.1%
-[2021-10-29 13:53:13,701] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-29 13:53:13,701] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-29 13:53:13,701] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-29 13:53:13,701] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-29 13:53:13,701] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-29 13:53:13,701] [INFO] [config.py:944:print] activation_checkpointing_config {
-  "partition_activations": false,
-  "contiguous_memory_optimization": false,
-  "cpu_checkpointing": false,
-  "number_checkpoints": null,
-  "synchronize_checkpoint_boundary": false,
-  "profile": false
-}
-[2021-10-29 13:53:13,701] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-29 13:53:13,701] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-29 13:53:13,701] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-29 13:53:13,701] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-29 13:53:13,701] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] flops_profiler_config ........ {
-  "enabled": false,
-  "profile_step": 1,
-  "module_depth": -1,
-  "top_modules": 1,
-  "detailed": true,
-  "output_file": null
-}
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-29 13:53:13,702] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] zero_config .................. {
-  "stage": 1,
-  "contiguous_gradients": true,
-  "reduce_scatter": true,
-  "reduce_bucket_size": 5.000000e+08,
-  "allgather_partitions": true,
-  "allgather_bucket_size": 5.000000e+08,
-  "overlap_comm": false,
-  "load_from_fp32_weights": true,
-  "elastic_checkpoint": true,
-  "offload_param": null,
-  "offload_optimizer": null,
-  "sub_group_size": 1.000000e+09,
-  "prefetch_bucket_size": 5.000000e+07,
-  "param_persistence_threshold": 1.000000e+05,
-  "max_live_parameters": 1.000000e+09,
-  "max_reuse_distance": 1.000000e+09,
-  "gather_fp16_weights_on_model_save": false,
-  "ignore_unused_parameters": true,
-  "round_robin_gradients": false,
-  "legacy_stage1": false
-}
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-29 13:53:13,703] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-29 13:53:13,703] [INFO] [config.py:946:print] json = {
-  "train_micro_batch_size_per_gpu": 1,
-  "train_batch_size": 2.048000e+03,
-  "gradient_clipping": 1.0,
-  "zero_optimization": {
-    "stage": 1
-  },
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 12
-  },
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": 64,
-    "max_difficulty": 2.048000e+03,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": 3.600000e+04,
-      "difficulty_step": 8
-    }
-  },
-  "steps_per_print": 2.000000e+03,
-  "wall_clock_breakdown": false
-}
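The curriculum_learning block in the dumped config requests a fixed_linear sequence-length schedule from 64 to 2048 tokens over 36000 steps. A small sketch of that schedule as the name and schedule_config suggest it behaves (the authoritative rounding rule lives in DeepSpeed's curriculum scheduler; this assumes a linear ramp snapped down to a multiple of difficulty_step):

    def seqlen_at(step, min_d=64, max_d=2048, total_steps=36000, diff_step=8):
        """Approximate fixed_linear curriculum: linear ramp from min_d to max_d."""
        if step >= total_steps:
            return max_d
        d = min_d + (max_d - min_d) * step / total_steps
        return max(min_d, int(d // diff_step) * diff_step)  # snap to a multiple of difficulty_step

    print(seqlen_at(0), seqlen_at(18000), seqlen_at(36000))  # -> 64 1056 2048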
-[2021-10-29 13:53:13,704] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-29 13:53:14,028] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-29 13:53:14,028] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-29 13:53:14,028] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
[… 125 similar engine.py:151 lines omitted, one per remaining rank; the pattern is that ranks 4s..4s+3 hold pipeline stage s, stage 0 holds layers [0, 5), stage 31 holds layers [65, 71), and each middle stage s holds the two layers [2s+3, 2s+5) with STAGE_PARAMS=807539800 per rank …]
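The engine.py:151 numbers are internally consistent: with 32 pipeline stages of 4 tensor-parallel ranks each, the per-rank STAGE_PARAMS reproduce TOTAL_PARAMS exactly, and the gap to UNIQUE_PARAMS is the duplicated copy of the tied embeddings that the UserWarning further down refers to. A quick check, with every constant copied from the log:

    # Per-rank parameter counts, as logged by engine.py:151.
    stage_params = {0: 978_291_800, 31: 978_315_000}             # first/last stages carry embeddings
    stage_params.update({s: 807_539_800 for s in range(1, 31)})  # 30 middle stages, 2 layers each

    tp_ranks_per_stage = 4  # 128 ranks / 32 stages; each rank holds a distinct tensor-parallel shard
    total = tp_ranks_per_stage * sum(stage_params.values())
    print(total)                    # 104731203200 == TOTAL_PARAMS
    print(total - 104_048_195_200)  # 683008000 parameters double-counted (the embedding copy)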
- > using checkpoint value 0.0001 for learning rate
- > using checkpoint value 6e-06 for minimum learning rate
- > using checkpoint value 3750000 for warmup iterations
- > using checkpoint value 600000000 for total number of iterations
- > using checkpoint value cosine for decay style
-successfully loaded 1 ZeRO state_dicts for rank 36
[… matching "successfully loaded 1 ZeRO state_dicts for rank N" and "loading 1 zero partition checkpoints for rank N" lines for every rank 0-127 omitted …]
-loading 1 zero partition checkpoints for rank 1
- checkpoint version 3.0
- successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 2548
-time (ms) | load-checkpoint: 15496.77
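The ~15.5 s resume above is each rank reading its own ZeRO optimizer partition plus the weights of its pipeline stage. A rough sketch of the call that drives it, assuming the engine returned by deepspeed.initialize (reading the step out of client_state is an assumption about what Megatron-DeepSpeed stores there):

    def resume_from(engine, load_dir):
        # Per rank: module weights for its stage + its ZeRO optimizer partition,
        # which is what the per-rank lines above report.
        load_path, client_state = engine.load_checkpoint(load_dir)
        return client_state.get("iteration", 0)  # assumed key for the training step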
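The UserWarning above explains the spread in the "estimated model parameters" lines that follow: middle pipeline stages count only their transformer blocks, while the first and last stages additionally hold embedding copies, so their with-embeddings figures come out larger. A back-of-envelope for the ~103B figure, assuming the commonly reported 64-layer, 11600-hidden shape for tr8b-104B (an assumption; the logged values come from summing the real module shapes, so biases, layernorms, and other small tensors account for the residual):

    # All sizes here are assumptions for illustration, not read from the run's config.
    layers, hidden, vocab = 64, 11600, 50257

    transformer = 12 * layers * hidden ** 2  # QKV, attention projection, and the two MLP matrices
    embedding = vocab * hidden               # one copy of the word-embedding matrix

    print(f"transformer-only estimate: {transformer / 1e9:.2f}B")                # ~103.34B
    print(f"plus one embedding copy:   {(transformer + embedding) / 1e9:.2f}B")  # ~103.93B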
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
-estimated model parameters: 125.22432
-estimated model parameters without embeddings: 103.3650944
-estimated model parameters without embeddings: 103.368064
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-29 13:53:29
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 20008960
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.061958 seconds
- number of documents: 304230423
- > dataset split:
- train:
- document indices in [0, 288714672) total of 288714672 documents
- validation:
- document indices in [288714672, 303926193) total of 15211521 documents
- test:
- document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
- loaded indexed file in 0.166 seconds
- total number of samples: 657686117
- total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_shuffle_idx.npy
- loaded indexed file in 0.195 seconds
- total number of samples: 20781483
- total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
- loaded indexed file in 0.009 seconds
- total number of samples: 137384
- total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-29 13:53:35
-done with setup ...
-training ...
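[Editor's note] The epoch counts above follow directly from the target sample counts: one pass over the shuffled train split yields roughly 131.5M samples, so five passes (657686117 samples in total) are needed to cover the 600M-sample target. A quick check of that arithmetic (variable names are illustrative, not the Megatron-LM dataset builder's):

import math

samples_per_epoch = 657686117 // 5   # one pass over the train split, ~131.5M samples
target_samples = 600000000           # "train: 600000000" above

# four epochs (~526M samples) fall short of the target, so five are built
print(math.ceil(target_samples / samples_per_epoch))  # 5, matching "total number of epochs: 5"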
-time (ms) | model-and-optimizer-setup: 22532.16 | train/valid/test-data-iterators-setup: 5251.72
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters: 103.3650944 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
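[Editor's note] The handful of distinct values above comes from pipeline parallelism: ranks that also hold the embedding report about 125.22 billion parameters, interior ranks about 103.37 billion, and the difference (~21.86 billion) is the embedding weights. A minimal sketch of how such per-rank counts are typically produced (toy module sizes, not the real 104B configuration):

import torch.nn as nn

# toy stand-in for one rank's model partition (hypothetical sizes)
model = nn.ModuleDict({
    "embedding": nn.Embedding(50257, 1024),
    "layers": nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(4)]),
})

total = sum(p.numel() for p in model.parameters())
embedding = sum(p.numel() for p in model["embedding"].parameters())
print(f"Number of parameters: {total / 1e9} billion")
print(f"Number of parameters without embeddings: {(total - embedding) / 1e9} billion")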
-[before the start of training step] datetime: 2021-10-29 13:53:35
-[2021-10-29 13:53:35,756] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-29 13:53:35,756] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-29 13:53:35,756] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-29 13:53:35,756] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-29 13:53:35,756] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
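[Editor's note] The DeepSpeed banner above reports activation checkpointing over 64 layers, with activation partitioning and CPU checkpointing both disabled. A generic PyTorch sketch of the same memory-saving idea, recomputing activations during backward instead of storing them (an illustration, not DeepSpeed's implementation):

import torch
from torch.utils.checkpoint import checkpoint_sequential

# 64 toy layers standing in for the transformer stack reported above
layers = torch.nn.Sequential(*[torch.nn.Linear(256, 256) for _ in range(64)])
x = torch.randn(8, 256, requires_grad=True)

# keep activations only at 4 segment boundaries; the rest are recomputed
# during backward, trading extra compute for lower peak memory
y = checkpoint_sequential(layers, 4, x)
y.sum().backward()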
-[Rank 3] (after 2549 iterations) memory (MB) | allocated: 13215.67529296875 | max allocated: 20679.22412109375 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 11] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 7] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 2] (after 2549 iterations) memory (MB) | allocated: 13215.67529296875 | max allocated: 20679.22412109375 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 0] (after 2549 iterations) memory (MB) | allocated: 13215.67529296875 | max allocated: 20679.22412109375 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 126] (after 2549 iterations) memory (MB) | allocated: 13108.01806640625 | max allocated: 20571.6240234375 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 10] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 6] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 4] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 124] (after 2549 iterations) memory (MB) | allocated: 13108.373046875 | max allocated: 20571.97900390625 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 16] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 8] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 12] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 125] (after 2549 iterations) memory (MB) | allocated: 13108.01806640625 | max allocated: 20571.6240234375 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 1] (after 2549 iterations) memory (MB) | allocated: 13215.67529296875 | max allocated: 20679.22412109375 | reserved: 24404.0 | max reserved: 24404.0
-[Rank 5] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 9] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 13] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 17] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 21] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 25] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 29] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 33] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 37] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 41] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 24] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 28] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 49] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 36] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 45] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 32] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 20] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 44] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 40] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 48] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 19] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 18] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 14] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 15] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 57] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 56] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 65] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 68] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 69] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 64] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 61] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 52] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 53] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 60] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 77] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 76] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 73] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 72] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 80] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 89] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 85] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 84] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 81] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 96] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 97] (after 2549 iterations) memory (MB) | allocated: 10789.34521484375 | max allocated: 16949.52685546875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 92] (after 2549 iterations) memory (MB) | allocated: 10789.34521484375 | max allocated: 16949.52685546875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 93] (after 2549 iterations) memory (MB) | allocated: 10788.7158203125 | max allocated: 16948.8974609375 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 101] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 100] (after 2549 iterations) memory (MB) | allocated: 10789.34521484375 | max allocated: 16949.52685546875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 113] (after 2549 iterations) memory (MB) | allocated: 10789.1298828125 | max allocated: 16949.3115234375 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 26] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 27] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 105] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 104] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 117] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 30] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 121] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 120] (after 2549 iterations) memory (MB) | allocated: 10788.9775390625 | max allocated: 16949.1591796875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 116] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 31] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 109] (after 2549 iterations) memory (MB) | allocated: 10789.1298828125 | max allocated: 16949.3115234375 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 34] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 22] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 39] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 38] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 35] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 88] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 43] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 42] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 46] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 23] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 47] (after 2549 iterations) memory (MB) | allocated: 10789.30517578125 | max allocated: 16949.48681640625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 51] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 50] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 54] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 55] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 59] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 62] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 58] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 67] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 108] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 71] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 70] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 63] (after 2549 iterations) memory (MB) | allocated: 10789.45751953125 | max allocated: 16949.63916015625 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 66] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 112] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 75] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 74] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 78] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 82] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 79] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 83] (after 2549 iterations) memory (MB) | allocated: 10789.34521484375 | max allocated: 16949.52685546875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 87] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 86] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 91] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 90] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 95] (after 2549 iterations) memory (MB) | allocated: 10789.34521484375 | max allocated: 16949.52685546875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 99] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 94] (after 2549 iterations) memory (MB) | allocated: 10789.34521484375 | max allocated: 16949.52685546875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 98] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 102] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 103] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 106] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 107] (after 2549 iterations) memory (MB) | allocated: 10788.92578125 | max allocated: 16949.107421875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 111] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 110] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 115] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 114] (after 2549 iterations) memory (MB) | allocated: 10789.1298828125 | max allocated: 16949.3115234375 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 118] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 119] (after 2549 iterations) memory (MB) | allocated: 10789.1298828125 | max allocated: 16949.3115234375 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 122] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
-[Rank 123] (after 2549 iterations) memory (MB) | allocated: 10788.61865234375 | max allocated: 16948.80029296875 | reserved: 20072.0 | max reserved: 20072.0
- iteration 2549/ 292968 | consumed samples: 5220352 | consumed tokens: 680329216 | elapsed time per iteration (ms): 204001.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715835E+00 | loss scale: 131072.0 | grad norm: 44932.500 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-[Rank 127] (after 2549 iterations) memory (MB) | allocated: 13109.1904296875 | max allocated: 20572.79638671875 | reserved: 24404.0 | max reserved: 24404.0
-time (ms)
- iteration 2550/ 292968 | consumed samples: 5222400 | consumed tokens: 680738816 | elapsed time per iteration (ms): 129106.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.745429E+00 | loss scale: 131072.0 | grad norm: 84644.311 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 2550 | lm loss value: 3.754761E+00 | lm loss PPL: 4.272403E+01 |
-------------------------------------------------------------------------------------------------
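[Editor's note] The perplexity in the validation banner is simply the exponential of the reported lm loss; a one-line check:

import math
print(math.exp(3.754761))  # ≈ 42.72403, i.e. "lm loss PPL: 4.272403E+01"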
- iteration 2551/ 292968 | consumed samples: 5224448 | consumed tokens: 681148416 | elapsed time per iteration (ms): 327816.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.789277E+00 | loss scale: 131072.0 | grad norm: 102773.008 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2552/ 292968 | consumed samples: 5226496 | consumed tokens: 681558016 | elapsed time per iteration (ms): 123801.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.791745E+00 | loss scale: 131072.0 | grad norm: 109612.575 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2553/ 292968 | consumed samples: 5228544 | consumed tokens: 681967616 | elapsed time per iteration (ms): 122139.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.773279E+00 | loss scale: 131072.0 | grad norm: 104541.342 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2554/ 292968 | consumed samples: 5230592 | consumed tokens: 682377216 | elapsed time per iteration (ms): 120670.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.758345E+00 | loss scale: 131072.0 | grad norm: 95588.450 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2555/ 292968 | consumed samples: 5232640 | consumed tokens: 682786816 | elapsed time per iteration (ms): 119368.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.734764E+00 | loss scale: 131072.0 | grad norm: 63212.346 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2556/ 292968 | consumed samples: 5234688 | consumed tokens: 683196416 | elapsed time per iteration (ms): 114182.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.728014E+00 | loss scale: 131072.0 | grad norm: 52200.615 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2557/ 292968 | consumed samples: 5236736 | consumed tokens: 683606016 | elapsed time per iteration (ms): 118260.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.735076E+00 | loss scale: 131072.0 | grad norm: 63669.330 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2558/ 292968 | consumed samples: 5238784 | consumed tokens: 684015616 | elapsed time per iteration (ms): 120117.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.678517E+00 | loss scale: 131072.0 | grad norm: 55109.504 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2559/ 292968 | consumed samples: 5240832 | consumed tokens: 684425216 | elapsed time per iteration (ms): 123988.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.716733E+00 | loss scale: 131072.0 | grad norm: 37041.985 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2560/ 292968 | consumed samples: 5242880 | consumed tokens: 684834816 | elapsed time per iteration (ms): 117595.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.704454E+00 | loss scale: 131072.0 | grad norm: 42950.094 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2561/ 292968 | consumed samples: 5244928 | consumed tokens: 685244416 | elapsed time per iteration (ms): 113327.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.680289E+00 | loss scale: 131072.0 | grad norm: 43986.618 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2562/ 292968 | consumed samples: 5246976 | consumed tokens: 685654016 | elapsed time per iteration (ms): 114878.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715766E+00 | loss scale: 131072.0 | grad norm: 40044.162 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2563/ 292968 | consumed samples: 5249024 | consumed tokens: 686063616 | elapsed time per iteration (ms): 115775.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715712E+00 | loss scale: 131072.0 | grad norm: 39075.622 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2564/ 292968 | consumed samples: 5251072 | consumed tokens: 686473216 | elapsed time per iteration (ms): 116737.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.711129E+00 | loss scale: 131072.0 | grad norm: 38300.643 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2565/ 292968 | consumed samples: 5253120 | consumed tokens: 686882816 | elapsed time per iteration (ms): 112218.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.673662E+00 | loss scale: 131072.0 | grad norm: 43882.677 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2566/ 292968 | consumed samples: 5255168 | consumed tokens: 687292416 | elapsed time per iteration (ms): 112118.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.700151E+00 | loss scale: 131072.0 | grad norm: 44788.039 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2567/ 292968 | consumed samples: 5257216 | consumed tokens: 687702016 | elapsed time per iteration (ms): 111336.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.709495E+00 | loss scale: 131072.0 | grad norm: 49922.595 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2568/ 292968 | consumed samples: 5259264 | consumed tokens: 688111616 | elapsed time per iteration (ms): 113206.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.677535E+00 | loss scale: 131072.0 | grad norm: 51990.654 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2569/ 292968 | consumed samples: 5261312 | consumed tokens: 688521216 | elapsed time per iteration (ms): 115610.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.675712E+00 | loss scale: 131072.0 | grad norm: 43667.960 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2570/ 292968 | consumed samples: 5263360 | consumed tokens: 688930816 | elapsed time per iteration (ms): 109567.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.695056E+00 | loss scale: 131072.0 | grad norm: 35596.969 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2571/ 292968 | consumed samples: 5265408 | consumed tokens: 689340416 | elapsed time per iteration (ms): 108918.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.691220E+00 | loss scale: 131072.0 | grad norm: 40846.960 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2572/ 292968 | consumed samples: 5267456 | consumed tokens: 689750016 | elapsed time per iteration (ms): 109564.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.674608E+00 | loss scale: 131072.0 | grad norm: 34759.061 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2573/ 292968 | consumed samples: 5269504 | consumed tokens: 690159616 | elapsed time per iteration (ms): 109086.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.710107E+00 | loss scale: 131072.0 | grad norm: 40718.934 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2574/ 292968 | consumed samples: 5271552 | consumed tokens: 690569216 | elapsed time per iteration (ms): 112606.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.678643E+00 | loss scale: 131072.0 | grad norm: 39370.588 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2575/ 292968 | consumed samples: 5273600 | consumed tokens: 690978816 | elapsed time per iteration (ms): 110138.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.683617E+00 | loss scale: 131072.0 | grad norm: 39210.369 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2576/ 292968 | consumed samples: 5275648 | consumed tokens: 691388416 | elapsed time per iteration (ms): 110373.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.690017E+00 | loss scale: 131072.0 | grad norm: 39506.338 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2577/ 292968 | consumed samples: 5277696 | consumed tokens: 691798016 | elapsed time per iteration (ms): 109006.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.681340E+00 | loss scale: 131072.0 | grad norm: 39215.505 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2578/ 292968 | consumed samples: 5279744 | consumed tokens: 692207616 | elapsed time per iteration (ms): 108729.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.683989E+00 | loss scale: 131072.0 | grad norm: 43813.734 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2579/ 292968 | consumed samples: 5281792 | consumed tokens: 692617216 | elapsed time per iteration (ms): 110093.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.648261E+00 | loss scale: 131072.0 | grad norm: 41748.514 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
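[Editor's note] Consumed tokens advance by global batch size times the current curriculum sequence length on every step; the deltas in the log confirm it (pure arithmetic on the logged values):

# 2048 samples/iteration at curriculum seqlen 200 -> 409600 tokens/iteration,
# e.g. iteration 2580 above: 693026816 - 692617216 == 409600
assert 693026816 - 692617216 == 2048 * 200
# once the curriculum steps to seqlen 208 (iteration 2613 below), the delta
# becomes 2048 * 208 == 425984, e.g. 706985984 - 706560000 at iteration 2614
assert 706985984 - 706560000 == 2048 * 208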
- iteration 2590/ 292968 | consumed samples: 5304320 | consumed tokens: 697122816 | elapsed time per iteration (ms): 109075.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.672368E+00 | loss scale: 131072.0 | grad norm: 42548.292 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2591/ 292968 | consumed samples: 5306368 | consumed tokens: 697532416 | elapsed time per iteration (ms): 110462.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715667E+00 | loss scale: 131072.0 | grad norm: 56467.360 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2592/ 292968 | consumed samples: 5308416 | consumed tokens: 697942016 | elapsed time per iteration (ms): 108770.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.679235E+00 | loss scale: 131072.0 | grad norm: 56631.566 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2593/ 292968 | consumed samples: 5310464 | consumed tokens: 698351616 | elapsed time per iteration (ms): 111097.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.685216E+00 | loss scale: 131072.0 | grad norm: 45606.299 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2594/ 292968 | consumed samples: 5312512 | consumed tokens: 698761216 | elapsed time per iteration (ms): 107750.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.690898E+00 | loss scale: 131072.0 | grad norm: 44391.074 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2595/ 292968 | consumed samples: 5314560 | consumed tokens: 699170816 | elapsed time per iteration (ms): 105800.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663409E+00 | loss scale: 131072.0 | grad norm: 40257.192 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2596/ 292968 | consumed samples: 5316608 | consumed tokens: 699580416 | elapsed time per iteration (ms): 106707.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.692679E+00 | loss scale: 131072.0 | grad norm: 38087.811 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2597/ 292968 | consumed samples: 5318656 | consumed tokens: 699990016 | elapsed time per iteration (ms): 106825.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.670416E+00 | loss scale: 131072.0 | grad norm: 39597.893 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2598/ 292968 | consumed samples: 5320704 | consumed tokens: 700399616 | elapsed time per iteration (ms): 109300.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.684358E+00 | loss scale: 131072.0 | grad norm: 50001.753 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2599/ 292968 | consumed samples: 5322752 | consumed tokens: 700809216 | elapsed time per iteration (ms): 106281.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.646162E+00 | loss scale: 131072.0 | grad norm: 53311.558 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2600/ 292968 | consumed samples: 5324800 | consumed tokens: 701218816 | elapsed time per iteration (ms): 109505.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.679504E+00 | loss scale: 131072.0 | grad norm: 29799.357 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2601/ 292968 | consumed samples: 5326848 | consumed tokens: 701628416 | elapsed time per iteration (ms): 108093.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.706317E+00 | loss scale: 131072.0 | grad norm: 44183.122 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2602/ 292968 | consumed samples: 5328896 | consumed tokens: 702038016 | elapsed time per iteration (ms): 108594.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.676766E+00 | loss scale: 131072.0 | grad norm: 39224.038 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2603/ 292968 | consumed samples: 5330944 | consumed tokens: 702447616 | elapsed time per iteration (ms): 110015.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.667875E+00 | loss scale: 131072.0 | grad norm: 35099.932 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2604/ 292968 | consumed samples: 5332992 | consumed tokens: 702857216 | elapsed time per iteration (ms): 108226.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.676596E+00 | loss scale: 131072.0 | grad norm: 44727.879 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2605/ 292968 | consumed samples: 5335040 | consumed tokens: 703266816 | elapsed time per iteration (ms): 106905.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.677900E+00 | loss scale: 131072.0 | grad norm: 53565.059 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2606/ 292968 | consumed samples: 5337088 | consumed tokens: 703676416 | elapsed time per iteration (ms): 107158.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.653572E+00 | loss scale: 131072.0 | grad norm: 65818.264 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2607/ 292968 | consumed samples: 5339136 | consumed tokens: 704086016 | elapsed time per iteration (ms): 109106.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.673477E+00 | loss scale: 131072.0 | grad norm: 71155.905 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2608/ 292968 | consumed samples: 5341184 | consumed tokens: 704495616 | elapsed time per iteration (ms): 107011.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.690985E+00 | loss scale: 131072.0 | grad norm: 51880.991 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
3.675349E+00 | loss scale: 131072.0 | grad norm: 36638.301 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2610/ 292968 | consumed samples: 5345280 | consumed tokens: 705314816 | elapsed time per iteration (ms): 107529.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.675055E+00 | loss scale: 131072.0 | grad norm: 42813.123 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2611/ 292968 | consumed samples: 5347328 | consumed tokens: 705724416 | elapsed time per iteration (ms): 106346.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.688864E+00 | loss scale: 131072.0 | grad norm: 61248.380 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2612/ 292968 | consumed samples: 5349376 | consumed tokens: 706134016 | elapsed time per iteration (ms): 108437.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.698993E+00 | loss scale: 131072.0 | grad norm: 76257.536 | num zeros: 0.0 | curriculum seqlen: 200 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2613/ 292968 | consumed samples: 5351424 | consumed tokens: 706560000 | elapsed time per iteration (ms): 109082.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.680395E+00 | loss scale: 131072.0 | grad norm: 58800.977 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2614/ 292968 | consumed samples: 5353472 | consumed tokens: 706985984 | elapsed time per iteration (ms): 109522.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.685363E+00 | loss scale: 131072.0 | grad norm: 47335.007 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2615/ 292968 | consumed samples: 5355520 | consumed tokens: 707411968 | elapsed time per iteration (ms): 108981.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.687247E+00 | loss scale: 131072.0 | grad norm: 44293.984 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2616/ 292968 | consumed samples: 5357568 | consumed tokens: 707837952 | elapsed time per iteration (ms): 112517.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.679443E+00 | loss scale: 131072.0 | grad norm: 57645.457 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2617/ 292968 | consumed samples: 5359616 | consumed tokens: 708263936 | elapsed time per iteration (ms): 112405.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.678071E+00 | loss scale: 131072.0 | grad norm: 50036.602 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2618/ 292968 | consumed samples: 5361664 | consumed tokens: 708689920 | elapsed time per iteration (ms): 120205.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.715105E+00 | loss scale: 131072.0 | grad norm: 38971.553 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2619/ 292968 | consumed samples: 5363712 | consumed 
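[Editorial note, not part of the original log: at iteration 2613 the curriculum seqlen steps from 200 to 208, and the per-iteration consumed-tokens increment changes accordingly from 2048 * 200 = 409600 to 2048 * 208 = 425984 (roughly 3.9k tokens/s at ~110 s per iteration). A minimal Python sketch checking that invariant against the numbers above:]

    # Sanity check (editorial addition, not from the log or training code):
    # with curriculum learning, each iteration consumes
    # global_batch_size * curriculum_seqlen tokens.
    global_batch_size = 2048

    # (consumed-tokens delta, curriculum seqlen) pairs read off the records above:
    checks = [
        (697942016 - 697532416, 200),  # iteration 2592, still at seqlen 200
        (706560000 - 706134016, 208),  # iteration 2613, first step at seqlen 208
        (706985984 - 706560000, 208),  # iteration 2614
    ]
    for delta, seqlen in checks:
        assert delta == global_batch_size * seqlen  # 409600, then 425984
    print("token deltas match global_batch_size * curriculum_seqlen")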
- iteration 2620/ 292968 | consumed samples: 5365760 | consumed tokens: 709541888 | elapsed time per iteration (ms): 108642.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.667502E+00 | loss scale: 131072.0 | grad norm: 54658.735 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2621/ 292968 | consumed samples: 5367808 | consumed tokens: 709967872 | elapsed time per iteration (ms): 113875.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.697449E+00 | loss scale: 131072.0 | grad norm: 51498.771 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2622/ 292968 | consumed samples: 5369856 | consumed tokens: 710393856 | elapsed time per iteration (ms): 113538.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.652537E+00 | loss scale: 131072.0 | grad norm: 51879.399 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2623/ 292968 | consumed samples: 5371904 | consumed tokens: 710819840 | elapsed time per iteration (ms): 108854.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.672163E+00 | loss scale: 131072.0 | grad norm: 59023.644 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2624/ 292968 | consumed samples: 5373952 | consumed tokens: 711245824 | elapsed time per iteration (ms): 115432.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.720203E+00 | loss scale: 131072.0 | grad norm: 55683.047 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2625/ 292968 | consumed samples: 5376000 | consumed tokens: 711671808 | elapsed time per iteration (ms): 116294.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.693449E+00 | loss scale: 131072.0 | grad norm: 46126.156 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2626/ 292968 | consumed samples: 5378048 | consumed tokens: 712097792 | elapsed time per iteration (ms): 110353.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.688527E+00 | loss scale: 131072.0 | grad norm: 47446.929 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2627/ 292968 | consumed samples: 5380096 | consumed tokens: 712523776 | elapsed time per iteration (ms): 113121.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.684380E+00 | loss scale: 131072.0 | grad norm: 55592.052 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2628/ 292968 | consumed samples: 5382144 | consumed tokens: 712949760 | elapsed time per iteration (ms): 109133.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.668486E+00 | loss scale: 131072.0 | grad norm: 51110.379 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2629/ 292968 | consumed samples: 5384192 | consumed tokens: 713375744 | elapsed time per iteration (ms): 116773.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.669429E+00 | loss scale: 131072.0 | grad norm: 40118.092 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2630/ 292968 | consumed samples: 5386240 | consumed tokens: 713801728 | elapsed time per iteration (ms): 113335.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.639462E+00 | loss scale: 131072.0 | grad norm: 45015.278 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2631/ 292968 | consumed samples: 5388288 | consumed tokens: 714227712 | elapsed time per iteration (ms): 110597.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.680613E+00 | loss scale: 131072.0 | grad norm: 48039.048 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2632/ 292968 | consumed samples: 5390336 | consumed tokens: 714653696 | elapsed time per iteration (ms): 113353.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.660079E+00 | loss scale: 131072.0 | grad norm: 61677.633 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2633/ 292968 | consumed samples: 5392384 | consumed tokens: 715079680 | elapsed time per iteration (ms): 110544.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.672939E+00 | loss scale: 131072.0 | grad norm: 63216.291 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2634/ 292968 | consumed samples: 5394432 | consumed tokens: 715505664 | elapsed time per iteration (ms): 112665.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.655371E+00 | loss scale: 131072.0 | grad norm: 43080.119 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2635/ 292968 | consumed samples: 5396480 | consumed tokens: 715931648 | elapsed time per iteration (ms): 111138.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.653244E+00 | loss scale: 131072.0 | grad norm: 38260.029 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2636/ 292968 | consumed samples: 5398528 | consumed tokens: 716357632 | elapsed time per iteration (ms): 112271.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.686737E+00 | loss scale: 131072.0 | grad norm: 37417.416 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2637/ 292968 | consumed samples: 5400576 | consumed tokens: 716783616 | elapsed time per iteration (ms): 112242.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.662534E+00 | loss scale: 131072.0 | grad norm: 39886.248 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2638/ 292968 | consumed samples: 5402624 | consumed tokens: 717209600 | elapsed time per iteration (ms): 110742.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.655050E+00 | loss scale: 131072.0 | grad norm: 45737.629 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2639/ 292968 | consumed samples: 5404672 | consumed tokens: 717635584 | elapsed time per iteration (ms): 108573.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.654103E+00 | loss scale: 131072.0 | grad norm: 53108.940 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2640/ 292968 | consumed samples: 5406720 | consumed tokens: 718061568 | elapsed time per iteration (ms): 108258.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.672544E+00 | loss scale: 131072.0 | grad norm: 50171.231 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2641/ 292968 | consumed samples: 5408768 | consumed tokens: 718487552 | elapsed time per iteration (ms): 110828.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.654342E+00 | loss scale: 131072.0 | grad norm: 57007.130 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2642/ 292968 | consumed samples: 5410816 | consumed tokens: 718913536 | elapsed time per iteration (ms): 113246.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.651388E+00 | loss scale: 131072.0 | grad norm: 43944.895 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2643/ 292968 | consumed samples: 5412864 | consumed tokens: 719339520 | elapsed time per iteration (ms): 110287.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663248E+00 | loss scale: 131072.0 | grad norm: 43550.783 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2644/ 292968 | consumed samples: 5414912 | consumed tokens: 719765504 | elapsed time per iteration (ms): 113879.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.672125E+00 | loss scale: 131072.0 | grad norm: 48530.869 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2645/ 292968 | consumed samples: 5416960 | consumed tokens: 720191488 | elapsed time per iteration (ms): 112169.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.677576E+00 | loss scale: 131072.0 | grad norm: 36340.904 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2646/ 292968 | consumed samples: 5419008 | consumed tokens: 720617472 | elapsed time per iteration (ms): 109739.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.653100E+00 | loss scale: 131072.0 | grad norm: 38286.000 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2647/ 292968 | consumed samples: 5421056 | consumed tokens: 721043456 | elapsed time per iteration (ms): 111697.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663363E+00 | loss scale: 131072.0 | grad norm: 43038.265 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2648/ 292968 | consumed samples: 5423104 | consumed tokens: 721469440 | elapsed time per iteration (ms): 115200.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.674177E+00 | loss scale: 131072.0 | grad norm: 46291.165 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2649/ 292968 | consumed samples: 5425152 | consumed tokens: 721895424 | elapsed time per iteration (ms): 121590.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.636573E+00 | loss scale: 131072.0 | grad norm: 39053.217 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2650/ 292968 | consumed samples: 5427200 | consumed tokens: 722321408 | elapsed time per iteration (ms): 109722.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.679028E+00 | loss scale: 131072.0 | grad norm: 42933.248 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2651/ 292968 | consumed samples: 5429248 | consumed tokens: 722747392 | elapsed time per iteration (ms): 112434.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.686113E+00 | loss scale: 131072.0 | grad norm: 64822.890 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2652/ 292968 | consumed samples: 5431296 | consumed tokens: 723173376 | elapsed time per iteration (ms): 111732.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.665164E+00 | loss scale: 131072.0 | grad norm: 74313.114 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2653/ 292968 | consumed samples: 5433344 | consumed tokens: 723599360 | elapsed time per iteration (ms): 112244.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.656505E+00 | loss scale: 131072.0 | grad norm: 66100.053 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2654/ 292968 | consumed samples: 5435392 | consumed tokens: 724025344 | elapsed time per iteration (ms): 110710.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.660270E+00 | loss scale: 131072.0 | grad norm: 52747.982 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2655/ 292968 | consumed samples: 5437440 | consumed tokens: 724451328 | elapsed time per iteration (ms): 113150.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.667228E+00 | loss scale: 131072.0 | grad norm: 42448.984 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2656/ 292968 | consumed samples: 5439488 | consumed tokens: 724877312 | elapsed time per iteration (ms): 109064.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.654269E+00 | loss scale: 131072.0 | grad norm: 41701.347 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2657/ 292968 | consumed samples: 5441536 | consumed tokens: 725303296 | elapsed time per iteration (ms): 111288.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.629042E+00 | loss scale: 131072.0 | grad norm: 43538.210 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2658/ 292968 | consumed samples: 5443584 | consumed tokens: 725729280 | elapsed time per iteration (ms): 112836.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.651093E+00 | loss scale: 131072.0 | grad norm: 37051.837 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2659/ 292968 | consumed samples: 5445632 | consumed tokens: 726155264 | elapsed time per iteration (ms): 111571.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.681063E+00 | loss scale: 131072.0 | grad norm: 35878.107 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2660/ 292968 | consumed samples: 5447680 | consumed tokens: 726581248 | elapsed time per iteration (ms): 110965.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.650982E+00 | loss scale: 131072.0 | grad norm: 36823.821 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2661/ 292968 | consumed samples: 5449728 | consumed tokens: 727007232 | elapsed time per iteration (ms): 110910.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.653976E+00 | loss scale: 131072.0 | grad norm: 34500.592 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2662/ 292968 | consumed samples: 5451776 | consumed tokens: 727433216 | elapsed time per iteration (ms): 108193.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.659220E+00 | loss scale: 131072.0 | grad norm: 37143.982 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2663/ 292968 | consumed samples: 5453824 | consumed tokens: 727859200 | elapsed time per iteration (ms): 110999.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.685313E+00 | loss scale: 131072.0 | grad norm: 56697.812 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2664/ 292968 | consumed samples: 5455872 | consumed tokens: 728285184 | elapsed time per iteration (ms): 109827.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.644733E+00 | loss scale: 131072.0 | grad norm: 66297.483 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2665/ 292968 | consumed samples: 5457920 | consumed tokens: 728711168 | elapsed time per iteration (ms): 108407.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.668042E+00 | loss scale: 131072.0 | grad norm: 42270.949 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2666/ 292968 | consumed samples: 5459968 | consumed tokens: 729137152 | elapsed time per iteration (ms): 108943.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.652422E+00 | loss scale: 131072.0 | grad norm: 48110.338 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2667/ 292968 | consumed samples: 5462016 | consumed tokens: 729563136 | elapsed time per iteration (ms): 108641.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.647676E+00 | loss scale: 131072.0 | grad norm: 47635.967 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2668/ 292968 | consumed samples: 5464064 | consumed tokens: 729989120 | elapsed time per iteration (ms): 108539.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.631294E+00 | loss scale: 131072.0 | grad norm: 41355.276 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2669/ 292968 | consumed samples: 5466112 | consumed tokens: 730415104 | elapsed time per iteration (ms): 107360.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.655333E+00 | loss scale: 131072.0 | grad norm: 49357.057 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2670/ 292968 | consumed samples: 5468160 | consumed tokens: 730841088 | elapsed time per iteration (ms): 107646.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.676918E+00 | loss scale: 131072.0 | grad norm: 60634.719 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2671/ 292968 | consumed samples: 5470208 | consumed tokens: 731267072 | elapsed time per iteration (ms): 107071.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.645433E+00 | loss scale: 131072.0 | grad norm: 70261.407 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2672/ 292968 | consumed samples: 5472256 | consumed tokens: 731693056 | elapsed time per iteration (ms): 107942.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.700987E+00 | loss scale: 131072.0 | grad norm: 53242.323 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2673/ 292968 | consumed samples: 5474304 | consumed tokens: 732119040 | elapsed time per iteration (ms): 107388.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.647931E+00 | loss scale: 131072.0 | grad norm: 47783.464 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2674/ 292968 | consumed samples: 5476352 | consumed tokens: 732545024 | elapsed time per iteration (ms): 108542.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.645255E+00 | loss scale: 131072.0 | grad norm: 41243.860 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2675/ 292968 | consumed samples: 5478400 | consumed tokens: 732971008 | elapsed time per iteration (ms): 107393.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.675579E+00 | loss scale: 131072.0 | grad norm: 36936.971 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2676/ 292968 | consumed samples: 5480448 | consumed tokens: 733396992 | elapsed time per iteration (ms): 108016.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.626407E+00 | loss scale: 131072.0 | grad norm: 35941.512 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2677/ 292968 | consumed samples: 5482496 | consumed tokens: 733822976 | elapsed time per iteration (ms): 109096.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.676696E+00 | loss scale: 131072.0 | grad norm: 43041.655 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2678/ 292968 | consumed samples: 5484544 | consumed tokens: 734248960 | elapsed time per iteration (ms): 109650.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.637198E+00 | loss scale: 131072.0 | grad norm: 46017.365 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2679/ 292968 | consumed samples: 5486592 | consumed tokens: 734674944 | elapsed time per iteration (ms): 106702.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.639631E+00 | loss scale: 131072.0 | grad norm: 50281.315 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2680/ 292968 | consumed samples: 5488640 | consumed tokens: 735100928 | elapsed time per iteration (ms): 106572.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.649009E+00 | loss scale: 131072.0 | grad norm: 69469.743 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2681/ 292968 | consumed samples: 5490688 | consumed tokens: 735526912 | elapsed time per iteration (ms): 106714.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.631393E+00 | loss scale: 131072.0 | grad norm: 77486.932 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2682/ 292968 | consumed samples: 5492736 | consumed tokens: 735952896 | elapsed time per iteration (ms): 107682.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.635889E+00 | loss scale: 131072.0 | grad norm: 60834.944 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2683/ 292968 | consumed samples: 5494784 | consumed tokens: 736378880 | elapsed time per iteration (ms): 109349.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.623968E+00 | loss scale: 131072.0 | grad norm: 39041.263 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2684/ 292968 | consumed samples: 5496832 | consumed tokens: 736804864 | elapsed time per iteration (ms): 108962.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.660020E+00 | loss scale: 131072.0 | grad norm: 47257.454 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2685/ 292968 | consumed samples: 5498880 | consumed tokens: 737230848 | elapsed time per iteration (ms): 109078.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.668654E+00 | loss scale: 131072.0 | grad norm: 58651.089 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2686/ 292968 | consumed samples: 5500928 | consumed tokens: 737656832 | elapsed time per iteration (ms): 107479.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.637446E+00 | loss scale: 131072.0 | grad norm: 53733.500 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2687/ 292968 | consumed samples: 5502976 | consumed tokens: 738082816 | elapsed time per iteration (ms): 107172.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.664834E+00 | loss scale: 131072.0 | grad norm: 46750.164 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2688/ 292968 | consumed samples: 5505024 | consumed tokens: 738508800 | elapsed time per iteration (ms): 108588.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.665414E+00 | loss scale: 131072.0 | grad norm: 45644.654 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2689/ 292968 | consumed samples: 5507072 | consumed tokens: 738934784 | elapsed time per iteration (ms): 109622.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.668394E+00 | loss scale: 131072.0 | grad norm: 50782.637 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2690/ 292968 | consumed samples: 5509120 | consumed tokens: 739360768 | elapsed time per iteration (ms): 107621.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663202E+00 | loss scale: 131072.0 | grad norm: 52166.426 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2691/ 292968 | consumed samples: 5511168 | consumed tokens: 739786752 | elapsed time per iteration (ms): 107684.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.669454E+00 | loss scale: 131072.0 | grad norm: 47945.866 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2692/ 292968 | consumed samples: 5513216 | consumed tokens: 740212736 | elapsed time per iteration (ms): 107481.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.646496E+00 | loss scale: 131072.0 | grad norm: 55706.897 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2693/ 292968 | consumed samples: 5515264 | consumed tokens: 740638720 | elapsed time per iteration (ms): 106229.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.669711E+00 | loss scale: 131072.0 | grad norm: 61546.656 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2694/ 292968 | consumed samples: 5517312 | consumed tokens: 741064704 | elapsed time per iteration (ms): 108069.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.638564E+00 | loss scale: 131072.0 | grad norm: 47969.327 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2695/ 292968 | consumed samples: 5519360 | consumed tokens: 741490688 | elapsed time per iteration (ms): 107661.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.636034E+00 | loss scale: 131072.0 | grad norm: 52002.889 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2696/ 292968 | consumed samples: 5521408 | consumed tokens: 741916672 | elapsed time per iteration (ms): 108355.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.679150E+00 | loss scale: 131072.0 | grad norm: 46128.153 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
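[Editorial note, not part of the original log: each iteration record above is a pipe-delimited list of "key: value" fields, so the log is straightforward to post-process. A minimal stdlib-only Python sketch (a hypothetical helper, not from the Megatron-DeepSpeed codebase) that parses one record into a dict, e.g. to plot the lm loss curve:]

    import re

    # Sketch: turn one " iteration N/ M | key: value | ..." record into a dict.
    ITER_RE = re.compile(r"iteration\s+(\d+)/\s*(\d+)\s*\|(.*)")

    def parse_iteration_record(line):
        m = ITER_RE.search(line)
        if m is None:
            return None  # not an iteration record (e.g. a "time (ms)" line)
        rec = {"iteration": int(m.group(1)), "total iterations": int(m.group(2))}
        for field in m.group(3).split("|"):
            key, sep, value = field.partition(":")
            if not sep:
                continue  # skip the empty trailing field after the final "|"
            try:
                rec[key.strip()] = float(value)  # handles "3.679150E+00" etc.
            except ValueError:
                rec[key.strip()] = value.strip()
        return rec

    rec = parse_iteration_record(
        "iteration 2696/ 292968 | consumed samples: 5521408 | "
        "lm loss: 3.679150E+00 | curriculum seqlen: 208 | "
        "number of nan iterations: 0 |")
    assert rec["lm loss"] == 3.679150 and rec["curriculum seqlen"] == 208.0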
- iteration 2697/ 292968 | consumed samples: 5523456 | consumed tokens: 742342656 | elapsed time per iteration (ms): 108205.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.645112E+00 | loss scale: 131072.0 | grad norm: 40706.511 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2698/ 292968 | consumed samples: 5525504 | consumed tokens: 742768640 | elapsed time per iteration (ms): 107611.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.668297E+00 | loss scale: 131072.0 | grad norm: 42011.629 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2699/ 292968 | consumed samples: 5527552 | consumed tokens: 743194624 | elapsed time per iteration (ms): 105736.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.624739E+00 | loss scale: 131072.0 | grad norm: 40611.683 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2700/ 292968 | consumed samples: 5529600 | consumed tokens: 743620608 | elapsed time per iteration (ms): 108668.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.648071E+00 | loss scale: 131072.0 | grad norm: 49795.671 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 2700 | lm loss value: 3.622391E+00 | lm loss PPL: 3.742693E+01 |
-------------------------------------------------------------------------------------------------
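[Editorial note, not part of the original log: the banner's "lm loss PPL" is simply exp of the "lm loss value"; exp(3.622391) ≈ 37.42693, matching the printed 3.742693E+01. A one-line check:]

    import math
    # PPL reported in the validation banner is exp(lm loss value):
    assert abs(math.exp(3.622391) - 3.742693e+01) < 1e-3
    print(math.exp(3.622391))  # ~37.42693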
-saving checkpoint at iteration 2700 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-29 18:42:08,625] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/mp_rank_01_model_states.pt
-[2021-10-29 18:42:08,737] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/mp_rank_00_model_states.pt
-[... 2021-10-29 18:42:22 to 18:42:43: near-identical "[INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved .../global_step2700/zero_pp_rank_0_mp_rank_NN_optim_states.pt" lines, one per mp rank (00-127), elided ...]
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-29 18:42:32,794] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-29 18:42:33,389] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-29 18:42:33,593] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-29 18:42:33,705] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_29_optim_states.pt -[2021-10-29 18:42:33,732] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-29 18:42:34,659] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-29 18:42:36,685] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_118_optim_states.pt -[2021-10-29 18:42:36,889] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-29 18:42:40,343] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-29 18:42:41,510] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-29 18:42:43,484] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step2700/zero_pp_rank_0_mp_rank_124_optim_states.pt - successfully saved checkpoint at iteration 2700 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 37784.02 - iteration 2701/ 292968 | consumed samples: 5531648 | consumed tokens: 744046592 | elapsed time per iteration (ms): 288626.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.639951E+00 | loss scale: 131072.0 | grad norm: 43742.504 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2702/ 292968 | consumed samples: 5533696 | consumed tokens: 744472576 | elapsed time per iteration (ms): 108874.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616026E+00 | loss scale: 131072.0 | grad norm: 35208.370 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2703/ 292968 | consumed samples: 5535744 | consumed tokens: 744898560 | elapsed time per iteration (ms): 108088.5 | learning rate: 1.000E-04 | global 
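The block above is DeepSpeed writing one ZeRO optimizer-state shard per model-parallel rank: the mp_rank indices run from 00 to 127, i.e. 128 shards for data-parallel rank zero, and the save-checkpoint timer puts the whole save at 37784.02 ms. A minimal sketch for tallying these lines offline (not part of the training stack; the regex assumes the exact `_save_zero_checkpoint` line format shown here):

```python
import re

# Matches the "zero checkpoint saved <path>" lines printed by DeepSpeed's
# engine.py:_save_zero_checkpoint, in the format visible in this log.
SAVED = re.compile(
    r"zero checkpoint saved "
    r"(?P<path>\S+/global_step(?P<step>\d+)/"
    r"zero_pp_rank_(?P<pp>\d+)_mp_rank_(?P<mp>\d+)_optim_states\.pt)"
)

def shard_summary(log_text: str):
    """Return (number of shard files saved, set of global steps seen)."""
    hits = list(SAVED.finditer(log_text))
    return len(hits), {m["step"] for m in hits}
```

Run over the save block above, `shard_summary` should report one file per mp_rank and the single step `2700`.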
- iteration 2704/ 292968 | consumed samples: 5537792 | consumed tokens: 745324544 | elapsed time per iteration (ms): 108393.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.623339E+00 | loss scale: 131072.0 | grad norm: 44899.447 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2705/ 292968 | consumed samples: 5539840 | consumed tokens: 745750528 | elapsed time per iteration (ms): 109781.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.634214E+00 | loss scale: 131072.0 | grad norm: 40552.495 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2706/ 292968 | consumed samples: 5541888 | consumed tokens: 746176512 | elapsed time per iteration (ms): 106870.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663705E+00 | loss scale: 131072.0 | grad norm: 53878.811 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2707/ 292968 | consumed samples: 5543936 | consumed tokens: 746602496 | elapsed time per iteration (ms): 108714.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.669910E+00 | loss scale: 131072.0 | grad norm: 49099.815 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2708/ 292968 | consumed samples: 5545984 | consumed tokens: 747028480 | elapsed time per iteration (ms): 111768.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.634970E+00 | loss scale: 131072.0 | grad norm: 48750.961 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2709/ 292968 | consumed samples: 5548032 | consumed tokens: 747454464 | elapsed time per iteration (ms): 108620.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.669893E+00 | loss scale: 131072.0 | grad norm: 52716.283 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2710/ 292968 | consumed samples: 5550080 | consumed tokens: 747880448 | elapsed time per iteration (ms): 109197.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.667222E+00 | loss scale: 131072.0 | grad norm: 48990.831 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2711/ 292968 | consumed samples: 5552128 | consumed tokens: 748306432 | elapsed time per iteration (ms): 106589.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663246E+00 | loss scale: 131072.0 | grad norm: 43851.583 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2712/ 292968 | consumed samples: 5554176 | consumed tokens: 748732416 | elapsed time per iteration (ms): 109260.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.668079E+00 | loss scale: 131072.0 | grad norm: 43290.031 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2713/ 292968 | consumed samples: 5556224 | consumed tokens: 749158400 | elapsed time per iteration (ms): 107601.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.636516E+00 | loss scale: 131072.0 | grad norm: 44117.311 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2714/ 292968 | consumed samples: 5558272 | consumed tokens: 749584384 | elapsed time per iteration (ms): 111941.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.613059E+00 | loss scale: 131072.0 | grad norm: 48753.729 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2715/ 292968 | consumed samples: 5560320 | consumed tokens: 750010368 | elapsed time per iteration (ms): 107706.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.643236E+00 | loss scale: 131072.0 | grad norm: 47320.785 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2716/ 292968 | consumed samples: 5562368 | consumed tokens: 750436352 | elapsed time per iteration (ms): 107503.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.638736E+00 | loss scale: 131072.0 | grad norm: 39553.326 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2717/ 292968 | consumed samples: 5564416 | consumed tokens: 750862336 | elapsed time per iteration (ms): 111420.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663488E+00 | loss scale: 131072.0 | grad norm: 37008.641 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2718/ 292968 | consumed samples: 5566464 | consumed tokens: 751288320 | elapsed time per iteration (ms): 108725.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.643597E+00 | loss scale: 131072.0 | grad norm: 37501.232 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2719/ 292968 | consumed samples: 5568512 | consumed tokens: 751714304 | elapsed time per iteration (ms): 108092.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.666336E+00 | loss scale: 131072.0 | grad norm: 37291.408 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2720/ 292968 | consumed samples: 5570560 | consumed tokens: 752140288 | elapsed time per iteration (ms): 106989.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.645058E+00 | loss scale: 131072.0 | grad norm: 35818.718 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2721/ 292968 | consumed samples: 5572608 | consumed tokens: 752566272 | elapsed time per iteration (ms): 107020.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.663189E+00 | loss scale: 131072.0 | grad norm: 42765.914 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2722/ 292968 | consumed samples: 5574656 | consumed tokens: 752992256 | elapsed time per iteration (ms): 107592.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.626324E+00 | loss scale: 131072.0 | grad norm: 42096.709 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2723/ 292968 | consumed samples: 5576704 | consumed tokens: 753418240 | elapsed time per iteration (ms): 107159.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.620199E+00 | loss scale: 131072.0 | grad norm: 42858.669 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2724/ 292968 | consumed samples: 5578752 | consumed tokens: 753844224 | elapsed time per iteration (ms): 108285.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.636538E+00 | loss scale: 131072.0 | grad norm: 52267.890 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2725/ 292968 | consumed samples: 5580800 | consumed tokens: 754270208 | elapsed time per iteration (ms): 107016.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.635979E+00 | loss scale: 131072.0 | grad norm: 62252.045 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2726/ 292968 | consumed samples: 5582848 | consumed tokens: 754696192 | elapsed time per iteration (ms): 108011.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621034E+00 | loss scale: 131072.0 | grad norm: 60358.952 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2727/ 292968 | consumed samples: 5584896 | consumed tokens: 755122176 | elapsed time per iteration (ms): 109635.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.652258E+00 | loss scale: 131072.0 | grad norm: 47172.802 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2728/ 292968 | consumed samples: 5586944 | consumed tokens: 755548160 | elapsed time per iteration (ms): 107384.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.627451E+00 | loss scale: 131072.0 | grad norm: 36331.178 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2729/ 292968 | consumed samples: 5588992 | consumed tokens: 755974144 | elapsed time per iteration (ms): 107474.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.640388E+00 | loss scale: 131072.0 | grad norm: 56245.709 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2730/ 292968 | consumed samples: 5591040 | consumed tokens: 756400128 | elapsed time per iteration (ms): 106534.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.628688E+00 | loss scale: 131072.0 | grad norm: 50462.183 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2731/ 292968 | consumed samples: 5593088 | consumed tokens: 756826112 | elapsed time per iteration (ms): 109099.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.626641E+00 | loss scale: 131072.0 | grad norm: 45554.469 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2732/ 292968 | consumed samples: 5595136 | consumed tokens: 757252096 | elapsed time per iteration (ms): 106975.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.633417E+00 | loss scale: 131072.0 | grad norm: 43118.911 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2733/ 292968 | consumed samples: 5597184 | consumed tokens: 757678080 | elapsed time per iteration (ms): 107732.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.629378E+00 | loss scale: 131072.0 | grad norm: 47643.289 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2734/ 292968 | consumed samples: 5599232 | consumed tokens: 758104064 | elapsed time per iteration (ms): 107551.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.630686E+00 | loss scale: 131072.0 | grad norm: 55684.911 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2735/ 292968 | consumed samples: 5601280 | consumed tokens: 758530048 | elapsed time per iteration (ms): 109141.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.644775E+00 | loss scale: 131072.0 | grad norm: 46671.094 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2736/ 292968 | consumed samples: 5603328 | consumed tokens: 758956032 | elapsed time per iteration (ms): 106757.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.641567E+00 | loss scale: 131072.0 | grad norm: 38768.949 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2737/ 292968 | consumed samples: 5605376 | consumed tokens: 759382016 | elapsed time per iteration (ms): 106512.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.640607E+00 | loss scale: 131072.0 | grad norm: 45939.340 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2738/ 292968 | consumed samples: 5607424 | consumed tokens: 759808000 | elapsed time per iteration (ms): 108703.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.652962E+00 | loss scale: 131072.0 | grad norm: 37337.453 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2739/ 292968 | consumed samples: 5609472 | consumed tokens: 760233984 | elapsed time per iteration (ms): 107105.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.645970E+00 | loss scale: 131072.0 | grad norm: 39313.933 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2740/ 292968 | consumed samples: 5611520 | consumed tokens: 760659968 | elapsed time per iteration (ms): 109296.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.617556E+00 | loss scale: 131072.0 | grad norm: 40998.755 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2741/ 292968 | consumed samples: 5613568 | consumed tokens: 761085952 | elapsed time per iteration (ms): 107155.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616538E+00 | loss scale: 131072.0 | grad norm: 40233.273 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2742/ 292968 | consumed samples: 5615616 | consumed tokens: 761511936 | elapsed time per iteration (ms): 110628.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.618248E+00 | loss scale: 131072.0 | grad norm: 41524.487 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2743/ 292968 | consumed samples: 5617664 | consumed tokens: 761937920 | elapsed time per iteration (ms): 107577.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.648963E+00 | loss scale: 131072.0 | grad norm: 45252.923 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2744/ 292968 | consumed samples: 5619712 | consumed tokens: 762363904 | elapsed time per iteration (ms): 106859.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.646383E+00 | loss scale: 131072.0 | grad norm: 39849.071 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2745/ 292968 | consumed samples: 5621760 | consumed tokens: 762789888 | elapsed time per iteration (ms): 107809.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.637014E+00 | loss scale: 131072.0 | grad norm: 44579.149 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2746/ 292968 | consumed samples: 5623808 | consumed tokens: 763215872 | elapsed time per iteration (ms): 107318.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.628282E+00 | loss scale: 131072.0 | grad norm: 43330.028 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2747/ 292968 | consumed samples: 5625856 | consumed tokens: 763641856 | elapsed time per iteration (ms): 106721.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.642391E+00 | loss scale: 131072.0 | grad norm: 55604.565 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2748/ 292968 | consumed samples: 5627904 | consumed tokens: 764067840 | elapsed time per iteration (ms): 107239.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.651043E+00 | loss scale: 131072.0 | grad norm: 73235.043 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2749/ 292968 | consumed samples: 5629952 | consumed tokens: 764493824 | elapsed time per iteration (ms): 108344.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.647343E+00 | loss scale: 131072.0 | grad norm: 42625.027 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2750/ 292968 | consumed samples: 5632000 | consumed tokens: 764919808 | elapsed time per iteration (ms): 107243.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.647093E+00 | loss scale: 131072.0 | grad norm: 46545.871 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2751/ 292968 | consumed samples: 5634048 | consumed tokens: 765345792 | elapsed time per iteration (ms): 108039.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621171E+00 | loss scale: 131072.0 | grad norm: 47890.201 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2752/ 292968 | consumed samples: 5636096 | consumed tokens: 765771776 | elapsed time per iteration (ms): 108492.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.642236E+00 | loss scale: 131072.0 | grad norm: 50479.753 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2753/ 292968 | consumed samples: 5638144 | consumed tokens: 766197760 | elapsed time per iteration (ms): 106986.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.642980E+00 | loss scale: 131072.0 | grad norm: 53144.847 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2754/ 292968 | consumed samples: 5640192 | consumed tokens: 766623744 | elapsed time per iteration (ms): 106692.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.672484E+00 | loss scale: 131072.0 | grad norm: 55138.382 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2755/ 292968 | consumed samples: 5642240 | consumed tokens: 767049728 | elapsed time per iteration (ms): 108170.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.633279E+00 | loss scale: 131072.0 | grad norm: 49120.143 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2756/ 292968 | consumed samples: 5644288 | consumed tokens: 767475712 | elapsed time per iteration (ms): 106331.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616369E+00 | loss scale: 131072.0 | grad norm: 47670.804 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2757/ 292968 | consumed samples: 5646336 | consumed tokens: 767901696 | elapsed time per iteration (ms): 106634.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.640013E+00 | loss scale: 131072.0 | grad norm: 43088.507 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2758/ 292968 | consumed samples: 5648384 | consumed tokens: 768327680 | elapsed time per iteration (ms): 105981.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.649504E+00 | loss scale: 131072.0 | grad norm: 50347.889 | num zeros: 0.0 | curriculum seqlen: 208 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2759/ 292968 | consumed samples: 5650432 | consumed tokens: 768770048 | elapsed time per iteration (ms): 107046.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.653844E+00 | loss scale: 131072.0 | grad norm: 60311.326 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2760/ 292968 | consumed samples: 5652480 | consumed tokens: 769212416 | elapsed time per iteration (ms): 106889.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.647440E+00 | loss scale: 131072.0 | grad norm: 44218.372 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2761/ 292968 | consumed samples: 5654528 | consumed tokens: 769654784 | elapsed time per iteration (ms): 107438.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.622699E+00 | loss scale: 131072.0 | grad norm: 38510.345 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
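Note the curriculum step visible above: the consumed-tokens column advances by global batch size × curriculum seqlen each iteration, i.e. 2048 × 208 = 425,984 tokens per step up to iteration 2758, and 2048 × 216 = 442,368 from iteration 2759 on, when the curriculum raises the sequence length. A quick consistency check, with the values copied from the rows above:

```python
# Sketch: verify the curriculum-learning token accounting in this log.
GBS = 2048  # global batch size, constant throughout this run

# "consumed tokens" as logged at iterations 2757, 2758 and 2759
tokens = {2757: 767901696, 2758: 768327680, 2759: 768770048}

assert tokens[2758] - tokens[2757] == GBS * 208  # curriculum seqlen still 208
assert tokens[2759] - tokens[2758] == GBS * 216  # seqlen raised to 216 at 2759
```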
- iteration 2762/ 292968 | consumed samples: 5656576 | consumed tokens: 770097152 | elapsed time per iteration (ms): 109170.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.626539E+00 | loss scale: 131072.0 | grad norm: 41537.999 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2763/ 292968 | consumed samples: 5658624 | consumed tokens: 770539520 | elapsed time per iteration (ms): 106290.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.620636E+00 | loss scale: 131072.0 | grad norm: 38338.730 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2764/ 292968 | consumed samples: 5660672 | consumed tokens: 770981888 | elapsed time per iteration (ms): 108604.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.645032E+00 | loss scale: 131072.0 | grad norm: 47035.184 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2765/ 292968 | consumed samples: 5662720 | consumed tokens: 771424256 | elapsed time per iteration (ms): 106883.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621568E+00 | loss scale: 131072.0 | grad norm: 46089.520 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2766/ 292968 | consumed samples: 5664768 | consumed tokens: 771866624 | elapsed time per iteration (ms): 109014.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.642921E+00 | loss scale: 131072.0 | grad norm: 38888.916 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2767/ 292968 | consumed samples: 5666816 | consumed tokens: 772308992 | elapsed time per iteration (ms): 115081.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.640292E+00 | loss scale: 131072.0 | grad norm: 54349.402 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2768/ 292968 | consumed samples: 5668864 | consumed tokens: 772751360 | elapsed time per iteration (ms): 107564.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.632621E+00 | loss scale: 131072.0 | grad norm: 71167.266 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2769/ 292968 | consumed samples: 5670912 | consumed tokens: 773193728 | elapsed time per iteration (ms): 106275.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616520E+00 | loss scale: 131072.0 | grad norm: 45766.634 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2770/ 292968 | consumed samples: 5672960 | consumed tokens: 773636096 | elapsed time per iteration (ms): 106937.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.644760E+00 | loss scale: 131072.0 | grad norm: 44833.458 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2771/ 292968 | consumed samples: 5675008 | consumed tokens: 774078464 | elapsed time per iteration (ms): 106666.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.649150E+00 | loss scale: 131072.0 | grad norm: 48352.421 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2772/ 292968 | consumed samples: 5677056 | consumed tokens: 774520832 | elapsed time per iteration (ms): 107224.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.632775E+00 | loss scale: 131072.0 | grad norm: 41286.571 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2773/ 292968 | consumed samples: 5679104 | consumed tokens: 774963200 | elapsed time per iteration (ms): 108570.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.625538E+00 | loss scale: 131072.0 | grad norm: 39947.113 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2774/ 292968 | consumed samples: 5681152 | consumed tokens: 775405568 | elapsed time per iteration (ms): 107425.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.642553E+00 | loss scale: 131072.0 | grad norm: 45947.934 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2775/ 292968 | consumed samples: 5683200 | consumed tokens: 775847936 | elapsed time per iteration (ms): 108422.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.611583E+00 | loss scale: 131072.0 | grad norm: 49188.590 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2776/ 292968 | consumed samples: 5685248 | consumed tokens: 776290304 | elapsed time per iteration (ms): 106321.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.624473E+00 | loss scale: 131072.0 | grad norm: 45472.938 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2777/ 292968 | consumed samples: 5687296 | consumed tokens: 776732672 | elapsed time per iteration (ms): 107721.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.611950E+00 | loss scale: 131072.0 | grad norm: 38562.850 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2778/ 292968 | consumed samples: 5689344 | consumed tokens: 777175040 | elapsed time per iteration (ms): 107658.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.613585E+00 | loss scale: 131072.0 | grad norm: 46004.111 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2779/ 292968 | consumed samples: 5691392 | consumed tokens: 777617408 | elapsed time per iteration (ms): 106148.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.633590E+00 | loss scale: 131072.0 | grad norm: 50262.580 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2780/ 292968 | consumed samples: 5693440 | consumed tokens: 778059776 | elapsed time per iteration (ms): 109242.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.617458E+00 | loss scale: 131072.0 | grad norm: 46993.373 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2781/ 292968 | consumed samples: 5695488 | consumed tokens: 778502144 | elapsed time per iteration (ms): 108542.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.619708E+00 | loss scale: 131072.0 | grad norm: 40592.094 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2782/ 292968 | consumed samples: 5697536 | consumed tokens: 778944512 | elapsed time per iteration (ms): 109180.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.599411E+00 | loss scale: 131072.0 | grad norm: 41274.000 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2783/ 292968 | consumed samples: 5699584 | consumed tokens: 779386880 | elapsed time per iteration (ms): 108797.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.640697E+00 | loss scale: 131072.0 | grad norm: 41347.924 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2784/ 292968 | consumed samples: 5701632 | consumed tokens: 779829248 | elapsed time per iteration (ms): 108076.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.614334E+00 | loss scale: 131072.0 | grad norm: 48965.230 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2785/ 292968 | consumed samples: 5703680 | consumed tokens: 780271616 | elapsed time per iteration (ms): 108604.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.620359E+00 | loss scale: 131072.0 | grad norm: 45782.734 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2786/ 292968 | consumed samples: 5705728 | consumed tokens: 780713984 | elapsed time per iteration (ms): 109398.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.597242E+00 | loss scale: 131072.0 | grad norm: 41604.963 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2787/ 292968 | consumed samples: 5707776 | consumed tokens: 781156352 | elapsed time per iteration (ms): 109714.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621368E+00 | loss scale: 131072.0 | grad norm: 49448.659 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2788/ 292968 | consumed samples: 5709824 | consumed tokens: 781598720 | elapsed time per iteration (ms): 109359.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.615190E+00 | loss scale: 131072.0 | grad norm: 56941.062 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2789/ 292968 | consumed samples: 5711872 | consumed tokens: 782041088 | elapsed time per iteration (ms): 108118.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.630431E+00 | loss scale: 131072.0 | grad norm: 70123.427 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2790/ 292968 | consumed samples: 5713920 | consumed tokens: 782483456 | elapsed time per iteration (ms): 107873.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.620925E+00 | loss scale: 131072.0 | grad norm: 64742.181 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2791/ 292968 | consumed samples: 5715968 | consumed tokens: 782925824 | elapsed time per iteration (ms): 107810.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.602437E+00 | loss scale: 131072.0 | grad norm: 59346.184 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2792/ 292968 | consumed samples: 5718016 | consumed tokens: 783368192 | elapsed time per iteration (ms): 105923.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.623143E+00 | loss scale: 131072.0 | grad norm: 42795.020 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2793/ 292968 | consumed samples: 5720064 | consumed tokens: 783810560 | elapsed time per iteration (ms): 108015.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.617936E+00 | loss scale: 131072.0 | grad norm: 43051.246 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2794/ 292968 | consumed samples: 5722112 | consumed tokens: 784252928 | elapsed time per iteration (ms): 106552.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.626259E+00 | loss scale: 131072.0 | grad norm: 41089.656 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2795/ 292968 | consumed samples: 5724160 | consumed tokens: 784695296 | elapsed time per iteration (ms): 107653.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.605443E+00 | loss scale: 131072.0 | grad norm: 32098.714 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2796/ 292968 | consumed samples: 5726208 | consumed tokens: 785137664 | elapsed time per iteration (ms): 106744.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.595196E+00 | loss scale: 131072.0 | grad norm: 43327.662 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2797/ 292968 | consumed samples: 5728256 | consumed tokens: 785580032 | elapsed time per iteration (ms): 107511.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.604870E+00 | loss scale: 131072.0 | grad norm: 44258.925 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2798/ 292968 | consumed samples: 5730304 | consumed tokens: 786022400 | elapsed time per iteration (ms): 106899.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.630413E+00 | loss scale: 131072.0 | grad norm: 42367.474 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2799/ 292968 | consumed samples: 5732352 | consumed tokens: 786464768 | elapsed time per iteration (ms): 107740.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.600448E+00 | loss scale: 131072.0 | grad norm: 43067.600 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2800/ 292968 | consumed samples: 5734400 | consumed tokens: 786907136 | elapsed time per iteration (ms): 107870.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.626150E+00 | loss scale: 131072.0 | grad norm: 49347.453 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2801/ 292968 | consumed samples: 5736448 | consumed tokens: 787349504 | elapsed time per iteration (ms): 108030.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.625966E+00 | loss scale: 131072.0 | grad norm: 65241.140 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2802/ 292968 | consumed samples: 5738496 | consumed tokens: 787791872 | elapsed time per iteration (ms): 106601.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.641464E+00 | loss scale: 131072.0 | grad norm: 51626.598 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2803/ 292968 | consumed samples: 5740544 | consumed tokens: 788234240 | elapsed time per iteration (ms): 106805.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.603205E+00 | loss scale: 131072.0 | grad norm: 44358.544 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2804/ 292968 | consumed samples: 5742592 | consumed tokens: 788676608 | elapsed time per iteration (ms): 107300.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.630790E+00 | loss scale: 131072.0 | grad norm: 55262.963 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2805/ 292968 | consumed samples: 5744640 | consumed tokens: 789118976 | elapsed time per iteration (ms): 108073.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.632464E+00 | loss scale: 131072.0 | grad norm: 43756.414 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2806/ 292968 | consumed samples: 5746688 | consumed tokens: 789561344 | elapsed time per iteration (ms): 108070.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.605131E+00 | loss scale: 131072.0 | grad norm: 44130.297 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2807/ 292968 | consumed samples: 5748736 | consumed tokens: 790003712 | elapsed time per iteration (ms): 109077.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621415E+00 | loss scale: 131072.0 | grad norm: 55303.159 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2808/ 292968 | consumed samples: 5750784 | consumed tokens: 790446080 | elapsed time per iteration (ms): 107247.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.641723E+00 | loss scale: 131072.0 | grad norm: 45153.202 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2809/ 292968 | consumed samples: 5752832 | consumed tokens: 790888448 | elapsed time per iteration (ms): 107223.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.595882E+00 | loss scale: 131072.0 | grad norm: 38979.104 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
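Each iteration row above is a fixed pipe-separated format, so it parses with a single regex. A sketch that pulls out the iteration number and grad norm and flags outliers such as 71167.266 at iteration 2768 or 70123.427 at 2789 (the 1.5×-of-mean threshold is an arbitrary illustrative choice, not anything the training code computes):

```python
import re
from statistics import mean

# Matches the Megatron-style rows in this log; only a subset of fields is
# extracted here for brevity.
ROW = re.compile(
    r"iteration\s+(?P<it>\d+)/\s*\d+ \|.*?"
    r"lm loss: (?P<loss>[\dE.+-]+) \|.*?grad norm: (?P<gn>[\d.]+)"
)

def grad_norm_spikes(log_text: str, factor: float = 1.5):
    """Return (iteration, grad norm) pairs well above the section mean."""
    rows = [(int(m["it"]), float(m["gn"])) for m in ROW.finditer(log_text)]
    avg = mean(g for _, g in rows)
    return [(it, g) for it, g in rows if g > factor * avg]
```

Note also that the loss scale sits at 131072.0 (2^17) throughout this stretch with zero skipped and zero NaN iterations, i.e. dynamic loss scaling never has to back off here.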
38979.104 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2810/ 292968 | consumed samples: 5754880 | consumed tokens: 791330816 | elapsed time per iteration (ms): 106901.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.607692E+00 | loss scale: 131072.0 | grad norm: 43136.436 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2811/ 292968 | consumed samples: 5756928 | consumed tokens: 791773184 | elapsed time per iteration (ms): 107865.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.609878E+00 | loss scale: 131072.0 | grad norm: 51127.238 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2812/ 292968 | consumed samples: 5758976 | consumed tokens: 792215552 | elapsed time per iteration (ms): 107058.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.604221E+00 | loss scale: 131072.0 | grad norm: 59971.860 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2813/ 292968 | consumed samples: 5761024 | consumed tokens: 792657920 | elapsed time per iteration (ms): 106734.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.601150E+00 | loss scale: 131072.0 | grad norm: 59839.758 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2814/ 292968 | consumed samples: 5763072 | consumed tokens: 793100288 | elapsed time per iteration (ms): 107294.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.609459E+00 | loss scale: 131072.0 | grad norm: 44272.464 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2815/ 292968 | consumed samples: 5765120 | consumed tokens: 793542656 | elapsed time per iteration (ms): 107589.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.598207E+00 | loss scale: 131072.0 | grad norm: 34374.961 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2816/ 292968 | consumed samples: 5767168 | consumed tokens: 793985024 | elapsed time per iteration (ms): 108154.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.603162E+00 | loss scale: 131072.0 | grad norm: 49046.611 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2817/ 292968 | consumed samples: 5769216 | consumed tokens: 794427392 | elapsed time per iteration (ms): 108012.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621816E+00 | loss scale: 131072.0 | grad norm: 48359.482 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2818/ 292968 | consumed samples: 5771264 | consumed tokens: 794869760 | elapsed time per iteration (ms): 107550.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.619554E+00 | loss scale: 131072.0 | grad norm: 50805.764 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2819/ 292968 | consumed samples: 5773312 | consumed tokens: 795312128 | elapsed time per iteration 
(ms): 107088.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.610817E+00 | loss scale: 131072.0 | grad norm: 47589.046 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2820/ 292968 | consumed samples: 5775360 | consumed tokens: 795754496 | elapsed time per iteration (ms): 107312.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.622972E+00 | loss scale: 131072.0 | grad norm: 50583.835 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2821/ 292968 | consumed samples: 5777408 | consumed tokens: 796196864 | elapsed time per iteration (ms): 107105.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.607291E+00 | loss scale: 131072.0 | grad norm: 50096.811 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2822/ 292968 | consumed samples: 5779456 | consumed tokens: 796639232 | elapsed time per iteration (ms): 108276.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.591371E+00 | loss scale: 131072.0 | grad norm: 62174.852 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2823/ 292968 | consumed samples: 5781504 | consumed tokens: 797081600 | elapsed time per iteration (ms): 108737.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616458E+00 | loss scale: 131072.0 | grad norm: 49909.754 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2824/ 292968 | consumed samples: 5783552 | consumed tokens: 797523968 | elapsed time per iteration (ms): 108378.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.609984E+00 | loss scale: 131072.0 | grad norm: 45037.512 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2825/ 292968 | consumed samples: 5785600 | consumed tokens: 797966336 | elapsed time per iteration (ms): 108579.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.599682E+00 | loss scale: 131072.0 | grad norm: 50603.875 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2826/ 292968 | consumed samples: 5787648 | consumed tokens: 798408704 | elapsed time per iteration (ms): 108993.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.590226E+00 | loss scale: 131072.0 | grad norm: 38121.980 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2827/ 292968 | consumed samples: 5789696 | consumed tokens: 798851072 | elapsed time per iteration (ms): 108645.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.610498E+00 | loss scale: 131072.0 | grad norm: 35446.185 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2828/ 292968 | consumed samples: 5791744 | consumed tokens: 799293440 | elapsed time per iteration (ms): 106874.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.625990E+00 | loss scale: 131072.0 | grad norm: 38682.359 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 
0 | -time (ms) - iteration 2829/ 292968 | consumed samples: 5793792 | consumed tokens: 799735808 | elapsed time per iteration (ms): 107321.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.613905E+00 | loss scale: 131072.0 | grad norm: 47641.936 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2830/ 292968 | consumed samples: 5795840 | consumed tokens: 800178176 | elapsed time per iteration (ms): 109505.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.617804E+00 | loss scale: 131072.0 | grad norm: 51658.457 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2831/ 292968 | consumed samples: 5797888 | consumed tokens: 800620544 | elapsed time per iteration (ms): 110232.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.622371E+00 | loss scale: 131072.0 | grad norm: 46156.375 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2832/ 292968 | consumed samples: 5799936 | consumed tokens: 801062912 | elapsed time per iteration (ms): 108491.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.610325E+00 | loss scale: 131072.0 | grad norm: 53753.610 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2833/ 292968 | consumed samples: 5801984 | consumed tokens: 801505280 | elapsed time per iteration (ms): 108964.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.578779E+00 | loss scale: 131072.0 | grad norm: 72010.915 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2834/ 292968 | consumed samples: 5804032 | consumed tokens: 801947648 | elapsed time per iteration (ms): 108346.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.618866E+00 | loss scale: 131072.0 | grad norm: 64329.295 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2835/ 292968 | consumed samples: 5806080 | consumed tokens: 802390016 | elapsed time per iteration (ms): 110946.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.604655E+00 | loss scale: 131072.0 | grad norm: 47907.307 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2836/ 292968 | consumed samples: 5808128 | consumed tokens: 802832384 | elapsed time per iteration (ms): 106704.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.606880E+00 | loss scale: 131072.0 | grad norm: 35398.349 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2837/ 292968 | consumed samples: 5810176 | consumed tokens: 803274752 | elapsed time per iteration (ms): 108522.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.604661E+00 | loss scale: 131072.0 | grad norm: 47411.375 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2838/ 292968 | consumed samples: 5812224 | consumed tokens: 803717120 | elapsed time per iteration (ms): 108295.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.615869E+00 | loss scale: 131072.0 | 
grad norm: 58693.709 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2839/ 292968 | consumed samples: 5814272 | consumed tokens: 804159488 | elapsed time per iteration (ms): 108560.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.595276E+00 | loss scale: 131072.0 | grad norm: 43139.469 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2840/ 292968 | consumed samples: 5816320 | consumed tokens: 804601856 | elapsed time per iteration (ms): 107201.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.597152E+00 | loss scale: 131072.0 | grad norm: 41496.067 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2841/ 292968 | consumed samples: 5818368 | consumed tokens: 805044224 | elapsed time per iteration (ms): 108413.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.609890E+00 | loss scale: 131072.0 | grad norm: 44816.731 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2842/ 292968 | consumed samples: 5820416 | consumed tokens: 805486592 | elapsed time per iteration (ms): 108818.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.601224E+00 | loss scale: 131072.0 | grad norm: 39062.566 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2843/ 292968 | consumed samples: 5822464 | consumed tokens: 805928960 | elapsed time per iteration (ms): 107270.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.591432E+00 | loss scale: 131072.0 | grad norm: 33878.289 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2844/ 292968 | consumed samples: 5824512 | consumed tokens: 806371328 | elapsed time per iteration (ms): 108902.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.619322E+00 | loss scale: 131072.0 | grad norm: 39437.591 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2845/ 292968 | consumed samples: 5826560 | consumed tokens: 806813696 | elapsed time per iteration (ms): 106725.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.606616E+00 | loss scale: 131072.0 | grad norm: 42773.236 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2846/ 292968 | consumed samples: 5828608 | consumed tokens: 807256064 | elapsed time per iteration (ms): 108307.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.596569E+00 | loss scale: 131072.0 | grad norm: 33925.936 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2847/ 292968 | consumed samples: 5830656 | consumed tokens: 807698432 | elapsed time per iteration (ms): 107571.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.619474E+00 | loss scale: 131072.0 | grad norm: 46628.677 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 2848/ 292968 | consumed samples: 5832704 | consumed tokens: 808140800 | elapsed time per 
- iteration 2849/ 292968 | consumed samples: 5834752 | consumed tokens: 808583168 | elapsed time per iteration (ms): 107387.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.621048E+00 | loss scale: 131072.0 | grad norm: 50151.881 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2850/ 292968 | consumed samples: 5836800 | consumed tokens: 809025536 | elapsed time per iteration (ms): 111974.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.596889E+00 | loss scale: 131072.0 | grad norm: 44661.913 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 2850 | lm loss value: 3.606561E+00 | lm loss PPL: 3.683916E+01 |
-------------------------------------------------------------------------------------------------
- iteration 2851/ 292968 | consumed samples: 5838848 | consumed tokens: 809467904 | elapsed time per iteration (ms): 293262.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.610353E+00 | loss scale: 131072.0 | grad norm: 52953.229 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2852/ 292968 | consumed samples: 5840896 | consumed tokens: 809910272 | elapsed time per iteration (ms): 108048.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.577106E+00 | loss scale: 131072.0 | grad norm: 57248.338 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2853/ 292968 | consumed samples: 5842944 | consumed tokens: 810352640 | elapsed time per iteration (ms): 110033.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.628800E+00 | loss scale: 131072.0 | grad norm: 44396.908 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2854/ 292968 | consumed samples: 5844992 | consumed tokens: 810795008 | elapsed time per iteration (ms): 107044.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.618514E+00 | loss scale: 131072.0 | grad norm: 51329.408 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2855/ 292968 | consumed samples: 5847040 | consumed tokens: 811237376 | elapsed time per iteration (ms): 109521.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.606665E+00 | loss scale: 131072.0 | grad norm: 51695.807 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2856/ 292968 | consumed samples: 5849088 | consumed tokens: 811679744 | elapsed time per iteration (ms): 111520.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.605186E+00 | loss scale: 131072.0 | grad norm: 57127.939 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2857/ 292968 | consumed samples: 5851136 | consumed tokens: 812122112 | elapsed time per iteration (ms): 107747.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.583688E+00 | loss scale: 131072.0 | grad norm: 63648.292 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2858/ 292968 | consumed samples: 5853184 | consumed tokens: 812564480 | elapsed time per iteration (ms): 108317.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.613451E+00 | loss scale: 131072.0 | grad norm: 43952.876 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2859/ 292968 | consumed samples: 5855232 | consumed tokens: 813006848 | elapsed time per iteration (ms): 109330.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.604447E+00 | loss scale: 131072.0 | grad norm: 37740.856 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2860/ 292968 | consumed samples: 5857280 | consumed tokens: 813449216 | elapsed time per iteration (ms): 108709.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.603303E+00 | loss scale: 131072.0 | grad norm: 57458.306 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2861/ 292968 | consumed samples: 5859328 | consumed tokens: 813891584 | elapsed time per iteration (ms): 106650.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.608145E+00 | loss scale: 131072.0 | grad norm: 52393.401 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2862/ 292968 | consumed samples: 5861376 | consumed tokens: 814333952 | elapsed time per iteration (ms): 107681.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616405E+00 | loss scale: 131072.0 | grad norm: 32894.656 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2863/ 292968 | consumed samples: 5863424 | consumed tokens: 814776320 | elapsed time per iteration (ms): 109563.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.610384E+00 | loss scale: 131072.0 | grad norm: 47021.383 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2864/ 292968 | consumed samples: 5865472 | consumed tokens: 815218688 | elapsed time per iteration (ms): 108385.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.590504E+00 | loss scale: 131072.0 | grad norm: 57256.087 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2865/ 292968 | consumed samples: 5867520 | consumed tokens: 815661056 | elapsed time per iteration (ms): 107444.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.592409E+00 | loss scale: 131072.0 | grad norm: 51456.682 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2866/ 292968 | consumed samples: 5869568 | consumed tokens: 816103424 | elapsed time per iteration (ms): 108344.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.593376E+00 | loss scale: 131072.0 | grad norm: 46659.862 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2867/ 292968 | consumed samples: 5871616 | consumed tokens: 816545792 | elapsed time per iteration (ms): 108466.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.595827E+00 | loss scale: 131072.0 | grad norm: 54325.109 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2868/ 292968 | consumed samples: 5873664 | consumed tokens: 816988160 | elapsed time per iteration (ms): 108576.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.601353E+00 | loss scale: 131072.0 | grad norm: 54910.998 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2869/ 292968 | consumed samples: 5875712 | consumed tokens: 817430528 | elapsed time per iteration (ms): 108279.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.604755E+00 | loss scale: 131072.0 | grad norm: 40035.085 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2870/ 292968 | consumed samples: 5877760 | consumed tokens: 817872896 | elapsed time per iteration (ms): 108226.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.596942E+00 | loss scale: 131072.0 | grad norm: 51633.638 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2871/ 292968 | consumed samples: 5879808 | consumed tokens: 818315264 | elapsed time per iteration (ms): 108799.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.568445E+00 | loss scale: 131072.0 | grad norm: 42668.057 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2872/ 292968 | consumed samples: 5881856 | consumed tokens: 818757632 | elapsed time per iteration (ms): 108322.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.596460E+00 | loss scale: 131072.0 | grad norm: 53089.201 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2873/ 292968 | consumed samples: 5883904 | consumed tokens: 819200000 | elapsed time per iteration (ms): 108308.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.605663E+00 | loss scale: 131072.0 | grad norm: 37640.119 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2874/ 292968 | consumed samples: 5885952 | consumed tokens: 819642368 | elapsed time per iteration (ms): 107165.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.600698E+00 | loss scale: 131072.0 | grad norm: 47902.376 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2875/ 292968 | consumed samples: 5888000 | consumed tokens: 820084736 | elapsed time per iteration (ms): 108537.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.576207E+00 | loss scale: 131072.0 | grad norm: 48318.518 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2876/ 292968 | consumed samples: 5890048 | consumed tokens: 820527104 | elapsed time per iteration (ms): 110065.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.606306E+00 | loss scale: 131072.0 | grad norm: 42505.226 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2877/ 292968 | consumed samples: 5892096 | consumed tokens: 820969472 | elapsed time per iteration (ms): 113153.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.564559E+00 | loss scale: 131072.0 | grad norm: 38448.848 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2878/ 292968 | consumed samples: 5894144 | consumed tokens: 821411840 | elapsed time per iteration (ms): 113414.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.603652E+00 | loss scale: 131072.0 | grad norm: 43235.868 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2879/ 292968 | consumed samples: 5896192 | consumed tokens: 821854208 | elapsed time per iteration (ms): 112942.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.610519E+00 | loss scale: 131072.0 | grad norm: 42753.475 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2880/ 292968 | consumed samples: 5898240 | consumed tokens: 822296576 | elapsed time per iteration (ms): 112330.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.585472E+00 | loss scale: 131072.0 | grad norm: 42997.431 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2881/ 292968 | consumed samples: 5900288 | consumed tokens: 822738944 | elapsed time per iteration (ms): 115850.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.589227E+00 | loss scale: 131072.0 | grad norm: 52735.283 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2882/ 292968 | consumed samples: 5902336 | consumed tokens: 823181312 | elapsed time per iteration (ms): 117352.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.587913E+00 | loss scale: 131072.0 | grad norm: 61993.163 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2883/ 292968 | consumed samples: 5904384 | consumed tokens: 823623680 | elapsed time per iteration (ms): 114588.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.606950E+00 | loss scale: 131072.0 | grad norm: 53318.929 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2884/ 292968 | consumed samples: 5906432 | consumed tokens: 824066048 | elapsed time per iteration (ms): 115613.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.593746E+00 | loss scale: 131072.0 | grad norm: 48226.915 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2885/ 292968 | consumed samples: 5908480 | consumed tokens: 824508416 | elapsed time per iteration (ms): 109905.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.576422E+00 | loss scale: 131072.0 | grad norm: 60282.054 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2886/ 292968 | consumed samples: 5910528 | consumed tokens: 824950784 | elapsed time per iteration (ms): 115631.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.622350E+00 | loss scale: 131072.0 | grad norm: 42503.319 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2887/ 292968 | consumed samples: 5912576 | consumed tokens: 825393152 | elapsed time per iteration (ms): 108735.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.583334E+00 | loss scale: 131072.0 | grad norm: 40759.786 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2888/ 292968 | consumed samples: 5914624 | consumed tokens: 825835520 | elapsed time per iteration (ms): 112183.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.613625E+00 | loss scale: 131072.0 | grad norm: 34984.835 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2889/ 292968 | consumed samples: 5916672 | consumed tokens: 826277888 | elapsed time per iteration (ms): 113530.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.601884E+00 | loss scale: 131072.0 | grad norm: 43762.554 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2890/ 292968 | consumed samples: 5918720 | consumed tokens: 826720256 | elapsed time per iteration (ms): 109708.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.586018E+00 | loss scale: 131072.0 | grad norm: 48665.333 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2891/ 292968 | consumed samples: 5920768 | consumed tokens: 827162624 | elapsed time per iteration (ms): 114402.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.575640E+00 | loss scale: 131072.0 | grad norm: 43950.399 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2892/ 292968 | consumed samples: 5922816 | consumed tokens: 827604992 | elapsed time per iteration (ms): 111457.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.607335E+00 | loss scale: 131072.0 | grad norm: 61011.506 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2893/ 292968 | consumed samples: 5924864 | consumed tokens: 828047360 | elapsed time per iteration (ms): 110627.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.592458E+00 | loss scale: 131072.0 | grad norm: 73838.143 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2894/ 292968 | consumed samples: 5926912 | consumed tokens: 828489728 | elapsed time per iteration (ms): 113893.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.611160E+00 | loss scale: 131072.0 | grad norm: 58267.867 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2895/ 292968 | consumed samples: 5928960 | consumed tokens: 828932096 | elapsed time per iteration (ms): 112896.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.601085E+00 | loss scale: 131072.0 | grad norm: 49950.770 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2896/ 292968 | consumed samples: 5931008 | consumed tokens: 829374464 | elapsed time per iteration (ms): 107416.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.613141E+00 | loss scale: 131072.0 | grad norm: 45955.891 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2897/ 292968 | consumed samples: 5933056 | consumed tokens: 829816832 | elapsed time per iteration (ms): 110770.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.567296E+00 | loss scale: 131072.0 | grad norm: 43192.510 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2898/ 292968 | consumed samples: 5935104 | consumed tokens: 830259200 | elapsed time per iteration (ms): 111199.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.630626E+00 | loss scale: 131072.0 | grad norm: 43365.188 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2899/ 292968 | consumed samples: 5937152 | consumed tokens: 830701568 | elapsed time per iteration (ms): 111515.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.616112E+00 | loss scale: 131072.0 | grad norm: 53164.334 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2900/ 292968 | consumed samples: 5939200 | consumed tokens: 831143936 | elapsed time per iteration (ms): 111860.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.550275E+00 | loss scale: 131072.0 | grad norm: 43188.227 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2901/ 292968 | consumed samples: 5941248 | consumed tokens: 831586304 | elapsed time per iteration (ms): 109949.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.630061E+00 | loss scale: 131072.0 | grad norm: 43277.850 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2902/ 292968 | consumed samples: 5943296 | consumed tokens: 832028672 | elapsed time per iteration (ms): 113684.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.576544E+00 | loss scale: 131072.0 | grad norm: 39475.347 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2903/ 292968 | consumed samples: 5945344 | consumed tokens: 832471040 | elapsed time per iteration (ms): 116121.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.561011E+00 | loss scale: 131072.0 | grad norm: 34748.511 | num zeros: 0.0 | curriculum seqlen: 216 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2904/ 292968 | consumed samples: 5947392 | consumed tokens: 832929792 | elapsed time per iteration (ms): 118927.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.593276E+00 | loss scale: 131072.0 | grad norm: 34844.779 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2905/ 292968 | consumed samples: 5949440 | consumed tokens: 833388544 | elapsed time per iteration (ms): 124123.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.567379E+00 | loss scale: 131072.0 | grad norm: 43492.029 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2906/ 292968 | consumed samples: 5951488 | consumed tokens: 833847296 | elapsed time per iteration (ms): 128176.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.609973E+00 | loss scale: 131072.0 | grad norm: 40838.194 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2907/ 292968 | consumed samples: 5953536 | consumed tokens: 834306048 | elapsed time per iteration (ms): 126622.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.602418E+00 | loss scale: 131072.0 | grad norm: 40460.023 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2908/ 292968 | consumed samples: 5955584 | consumed tokens: 834764800 | elapsed time per iteration (ms): 115314.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.603675E+00 | loss scale: 131072.0 | grad norm: 38787.744 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2909/ 292968 | consumed samples: 5957632 | consumed tokens: 835223552 | elapsed time per iteration (ms): 114147.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.562258E+00 | loss scale: 131072.0 | grad norm: 35660.082 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2910/ 292968 | consumed samples: 5959680 | consumed tokens: 835682304 | elapsed time per iteration (ms): 115825.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.584078E+00 | loss scale: 131072.0 | grad norm: 38497.407 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2911/ 292968 | consumed samples: 5961728 | consumed tokens: 836141056 | elapsed time per iteration (ms): 114407.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.603372E+00 | loss scale: 131072.0 | grad norm: 42956.135 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2912/ 292968 | consumed samples: 5963776 | consumed tokens: 836599808 | elapsed time per iteration (ms): 113579.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.578610E+00 | loss scale: 131072.0 | grad norm: 44572.043 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2913/ 292968 | consumed samples: 5965824 | consumed tokens: 837058560 | elapsed time per iteration (ms): 109699.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.590713E+00 | loss scale: 131072.0 | grad norm: 44567.901 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2914/ 292968 | consumed samples: 5967872 | consumed tokens: 837517312 | elapsed time per iteration (ms): 114497.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.588710E+00 | loss scale: 131072.0 | grad norm: 36749.140 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2915/ 292968 | consumed samples: 5969920 | consumed tokens: 837976064 | elapsed time per iteration (ms): 113965.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.591449E+00 | loss scale: 131072.0 | grad norm: 46845.472 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2916/ 292968 | consumed samples: 5971968 | consumed tokens: 838434816 | elapsed time per iteration (ms): 113852.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.586223E+00 | loss scale: 131072.0 | grad norm: 48040.295 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2917/ 292968 | consumed samples: 5974016 | consumed tokens: 838893568 | elapsed time per iteration (ms): 109084.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.598637E+00 | loss scale: 131072.0 | grad norm: 42287.570 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2918/ 292968 | consumed samples: 5976064 | consumed tokens: 839352320 | elapsed time per iteration (ms): 115093.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581700E+00 | loss scale: 131072.0 | grad norm: 40026.847 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2919/ 292968 | consumed samples: 5978112 | consumed tokens: 839811072 | elapsed time per iteration (ms): 109965.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.592082E+00 | loss scale: 131072.0 | grad norm: 47723.590 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2920/ 292968 | consumed samples: 5980160 | consumed tokens: 840269824 | elapsed time per iteration (ms): 109869.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.585698E+00 | loss scale: 131072.0 | grad norm: 63660.993 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2921/ 292968 | consumed samples: 5982208 | consumed tokens: 840728576 | elapsed time per iteration (ms): 115438.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.588047E+00 | loss scale: 131072.0 | grad norm: 56636.477 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2922/ 292968 | consumed samples: 5984256 | consumed tokens: 841187328 | elapsed time per iteration (ms): 111086.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.586881E+00 | loss scale: 131072.0 | grad norm: 51816.533 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2923/ 292968 | consumed samples: 5986304 | consumed tokens: 841646080 | elapsed time per iteration (ms): 109522.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569388E+00 | loss scale: 131072.0 | grad norm: 44686.943 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2924/ 292968 | consumed samples: 5988352 | consumed tokens: 842104832 | elapsed time per iteration (ms): 114370.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.574268E+00 | loss scale: 131072.0 | grad norm: 49260.288 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2925/ 292968 | consumed samples: 5990400 | consumed tokens: 842563584 | elapsed time per iteration (ms): 109770.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.601225E+00 | loss scale: 131072.0 | grad norm: 43825.150 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2926/ 292968 | consumed samples: 5992448 | consumed tokens: 843022336 | elapsed time per iteration (ms): 111148.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.618695E+00 | loss scale: 131072.0 | grad norm: 33164.313 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2927/ 292968 | consumed samples: 5994496 | consumed tokens: 843481088 | elapsed time per iteration (ms): 109933.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.567786E+00 | loss scale: 131072.0 | grad norm: 43929.632 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2928/ 292968 | consumed samples: 5996544 | consumed tokens: 843939840 | elapsed time per iteration (ms): 109596.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.596983E+00 | loss scale: 131072.0 | grad norm: 39738.615 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2929/ 292968 | consumed samples: 5998592 | consumed tokens: 844398592 | elapsed time per iteration (ms): 108939.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.583142E+00 | loss scale: 131072.0 | grad norm: 49234.468 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2930/ 292968 | consumed samples: 6000640 | consumed tokens: 844857344 | elapsed time per iteration (ms): 108569.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.579193E+00 | loss scale: 131072.0 | grad norm: 59939.532 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2931/ 292968 | consumed samples: 6002688 | consumed tokens: 845316096 | elapsed time per iteration (ms): 110324.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.598184E+00 | loss scale: 131072.0 | grad norm: 51423.146 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2932/ 292968 | consumed samples: 6004736 | consumed tokens: 845774848 | elapsed time per iteration (ms): 109051.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.588955E+00 | loss scale: 131072.0 | grad norm: 34095.438 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2933/ 292968 | consumed samples: 6006784 | consumed tokens: 846233600 | elapsed time per iteration (ms): 112498.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.552613E+00 | loss scale: 131072.0 | grad norm: 46413.050 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2934/ 292968 | consumed samples: 6008832 | consumed tokens: 846692352 | elapsed time per iteration (ms): 113705.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.543072E+00 | loss scale: 131072.0 | grad norm: 37885.740 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2935/ 292968 | consumed samples: 6010880 | consumed tokens: 847151104 | elapsed time per iteration (ms): 111771.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.579232E+00 | loss scale: 131072.0 | grad norm: 38377.831 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2936/ 292968 | consumed samples: 6012928 | consumed tokens: 847609856 | elapsed time per iteration (ms): 111097.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.573367E+00 | loss scale: 131072.0 | grad norm: 68023.928 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2937/ 292968 | consumed samples: 6014976 | consumed tokens: 848068608 | elapsed time per iteration (ms): 111594.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.605148E+00 | loss scale: 131072.0 | grad norm: 62448.239 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2938/ 292968 | consumed samples: 6017024 | consumed tokens: 848527360 | elapsed time per iteration (ms): 108341.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.598902E+00 | loss scale: 131072.0 | grad norm: 44643.002 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2939/ 292968 | consumed samples: 6019072 | consumed tokens: 848986112 | elapsed time per iteration (ms): 112222.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569798E+00 | loss scale: 131072.0 | grad norm: 45290.973 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2940/ 292968 | consumed samples: 6021120 | consumed tokens: 849444864 | elapsed time per iteration (ms): 115954.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569247E+00 | loss scale: 131072.0 | grad norm: 46322.512 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2941/ 292968 | consumed samples: 6023168 | consumed tokens: 849903616 | elapsed time per iteration (ms): 112653.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.577928E+00 | loss scale: 131072.0 | grad norm: 46443.426 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2942/ 292968 | consumed samples: 6025216 | consumed tokens: 850362368 | elapsed time per iteration (ms): 110347.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581179E+00 | loss scale: 131072.0 | grad norm: 37528.387 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2943/ 292968 | consumed samples: 6027264 | consumed tokens: 850821120 | elapsed time per iteration (ms): 112411.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.552205E+00 | loss scale: 131072.0 | grad norm: 35154.633 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2944/ 292968 | consumed samples: 6029312 | consumed tokens: 851279872 | elapsed time per iteration (ms): 110084.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.557537E+00 | loss scale: 131072.0 | grad norm: 38577.193 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2945/ 292968 | consumed samples: 6031360 | consumed tokens: 851738624 | elapsed time per iteration (ms): 109783.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.586143E+00 | loss scale: 131072.0 | grad norm: 31978.417 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2946/ 292968 | consumed samples: 6033408 | consumed tokens: 852197376 | elapsed time per iteration (ms): 109476.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.539176E+00 | loss scale: 131072.0 | grad norm: 41717.446 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2947/ 292968 | consumed samples: 6035456 | consumed tokens: 852656128 | elapsed time per iteration (ms): 106709.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.593690E+00 | loss scale: 131072.0 | grad norm: 47370.527 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2948/ 292968 | consumed samples: 6037504 | consumed tokens: 853114880 | elapsed time per iteration (ms): 108926.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.575230E+00 | loss scale: 131072.0 | grad norm: 49222.100 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2949/ 292968 | consumed samples: 6039552 | consumed tokens: 853573632 | elapsed time per iteration (ms): 109315.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.563631E+00 | loss scale: 131072.0 | grad norm: 43634.421 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2950/ 292968 | consumed samples: 6041600 | consumed tokens: 854032384 | elapsed time per iteration (ms): 108943.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581049E+00 | loss scale: 131072.0 | grad norm: 37329.440 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2951/ 292968 | consumed samples: 6043648 | consumed tokens: 854491136 | elapsed time per iteration (ms): 110552.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.579678E+00 | loss scale: 131072.0 | grad norm: 44014.353 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2952/ 292968 | consumed samples: 6045696 | consumed tokens: 854949888 | elapsed time per iteration (ms): 108183.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.579152E+00 | loss scale: 131072.0 | grad norm: 46173.935 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2953/ 292968 | consumed samples: 6047744 | consumed tokens: 855408640 | elapsed time per iteration (ms): 109332.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.577377E+00 | loss scale: 131072.0 | grad norm: 50786.089 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2954/ 292968 | consumed samples: 6049792 | consumed tokens: 855867392 | elapsed time per iteration (ms): 111339.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.571541E+00 | loss scale: 131072.0 | grad norm: 51713.195 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2955/ 292968 | consumed samples: 6051840 | consumed tokens: 856326144 | elapsed time per iteration (ms): 112592.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.568782E+00 | loss scale: 131072.0 | grad norm: 47013.905 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2956/ 292968 | consumed samples: 6053888 | consumed tokens: 856784896 | elapsed time per iteration (ms): 118278.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.592543E+00 | loss scale: 131072.0 | grad norm: 68652.320 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2957/ 292968 | consumed samples: 6055936 | consumed tokens: 857243648 | elapsed time per iteration (ms): 116456.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581419E+00 | loss scale: 131072.0 | grad norm: 67465.027 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2958/ 292968 | consumed samples: 6057984 | consumed tokens: 857702400 | elapsed time per iteration (ms): 115503.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.593701E+00 | loss scale: 131072.0 | grad norm: 40407.212 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2959/ 292968 | consumed samples: 6060032 | consumed tokens: 858161152 | elapsed time per iteration (ms): 107881.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.574839E+00 | loss scale: 131072.0 | grad norm: 47071.321 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2960/ 292968 | consumed samples: 6062080 | consumed tokens: 858619904 | elapsed time per iteration (ms): 108857.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.592325E+00 | loss scale: 131072.0 | grad norm: 35885.281 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2961/ 292968 | consumed samples: 6064128 | consumed tokens: 859078656 | elapsed time per iteration (ms): 110232.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.570963E+00 | loss scale: 131072.0 | grad norm: 41837.989 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2962/ 292968 | consumed samples: 6066176 | consumed tokens: 859537408 | elapsed time per iteration (ms): 108405.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.568669E+00 | loss scale: 131072.0 | grad norm: 40338.380 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2963/ 292968 | consumed samples: 6068224 | consumed tokens: 859996160 | elapsed time per iteration (ms): 111255.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569066E+00 | loss scale: 131072.0 | grad norm: 37378.356 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2964/ 292968 | consumed samples: 6070272 | consumed tokens: 860454912 | elapsed time per iteration (ms): 109133.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.573036E+00 | loss scale: 131072.0 | grad norm: 34300.355 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2965/ 292968 | consumed samples: 6072320 | consumed tokens: 860913664 | elapsed time per iteration (ms): 111700.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.582489E+00 | loss scale: 131072.0 | grad norm: 37110.802 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2966/ 292968 | consumed samples: 6074368 | consumed tokens: 861372416 | elapsed time per iteration (ms): 111653.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.566869E+00 | loss scale: 131072.0 | grad norm: 46214.171 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2967/ 292968 | consumed samples: 6076416 | consumed tokens: 861831168 | elapsed time per iteration (ms): 109532.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.543047E+00 | loss scale: 131072.0 | grad norm: 53160.973 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2968/ 292968 | consumed samples: 6078464 | consumed tokens: 862289920 | elapsed time per iteration (ms): 108516.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.566134E+00 | loss scale: 131072.0 | grad norm: 50371.823 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2969/ 292968 | consumed samples: 6080512 | consumed tokens: 862748672 | elapsed time per iteration (ms): 109361.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569240E+00 | loss scale: 131072.0 | grad norm: 34458.234 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2970/ 292968 | consumed samples: 6082560 | consumed tokens: 863207424 | elapsed time per iteration (ms): 108385.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.566778E+00 | loss scale: 131072.0 | grad norm: 37907.014 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2971/ 292968 | consumed samples: 6084608 | consumed tokens: 863666176 | elapsed time per iteration (ms): 108789.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555124E+00 | loss scale: 131072.0 | grad norm: 42524.268 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2972/ 292968 | consumed samples: 6086656 | consumed tokens: 864124928 | elapsed time per iteration (ms): 108934.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.574265E+00 | loss scale: 131072.0 | grad norm: 39274.079 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2973/ 292968 | consumed samples: 6088704 | consumed tokens: 864583680 | elapsed time per iteration (ms): 109187.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555774E+00 | loss scale: 131072.0 | grad norm: 38238.717 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2974/ 292968 | consumed samples: 6090752 | consumed tokens: 865042432 | elapsed time per iteration (ms): 110511.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555067E+00 | loss scale: 131072.0 | grad norm: 39191.315 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2975/ 292968 | consumed samples: 6092800 | consumed tokens: 865501184 | elapsed time per iteration (ms): 107761.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581964E+00 | loss scale: 131072.0 | grad norm: 36178.655 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2976/ 292968 | consumed samples: 6094848 | consumed tokens: 865959936 | elapsed time per iteration (ms): 109101.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.561371E+00 | loss scale: 131072.0 | grad norm: 38018.147 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2977/ 292968 | consumed samples: 6096896 | consumed tokens: 866418688 | elapsed time per iteration (ms): 108339.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.563130E+00 | loss scale: 131072.0 | grad norm: 40462.511 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2978/ 292968 | consumed samples: 6098944 | consumed tokens: 866877440 | elapsed time per iteration (ms): 109528.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.573788E+00 | loss scale: 131072.0 | grad norm: 43896.246 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2979/ 292968 | consumed samples: 6100992 | consumed tokens: 867336192 | elapsed time per iteration (ms): 106931.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545753E+00 | loss scale: 131072.0 | grad norm: 48392.502 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2980/ 292968 | consumed samples: 6103040 | consumed tokens: 867794944 | elapsed time per iteration (ms): 108092.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581406E+00 | loss scale: 131072.0 | grad norm: 44963.758 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2981/ 292968 | consumed samples: 6105088 | consumed tokens: 868253696 | elapsed time per iteration (ms): 108385.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.567360E+00 | loss scale: 131072.0 | grad norm: 44071.074 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2982/ 292968 | consumed samples: 6107136 | consumed tokens: 868712448 | elapsed time per iteration (ms): 109239.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569098E+00 | loss scale: 131072.0 | grad norm: 54621.671 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2983/ 292968 | consumed samples: 6109184 | consumed tokens: 869171200 | elapsed time per iteration (ms): 109125.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.564108E+00 | loss scale: 131072.0 | grad norm: 51907.478 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2984/ 292968 | consumed samples: 6111232 | consumed tokens: 869629952 | elapsed time per iteration (ms): 108790.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.585078E+00 | loss scale: 131072.0 | grad norm: 46261.555 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2985/ 292968 | consumed samples: 6113280 | consumed tokens: 870088704 | elapsed time per iteration (ms): 109176.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.551283E+00 | loss scale: 131072.0 | grad norm: 49454.560 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2986/ 292968 | consumed samples: 6115328 | consumed tokens: 870547456 | elapsed time per iteration (ms): 109128.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.598526E+00 | loss scale: 131072.0 | grad norm: 42870.425 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2987/ 292968 | consumed samples: 6117376 | consumed tokens: 871006208 | elapsed time per iteration (ms): 112759.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.569285E+00 | loss scale: 131072.0 | grad norm: 38108.923 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2988/ 292968 | consumed samples: 6119424 | consumed tokens: 871464960 | elapsed time per iteration (ms): 110772.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.537183E+00 | loss scale: 131072.0 | grad norm: 38268.666 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2989/ 292968 | consumed samples: 6121472 | consumed tokens: 871923712 | elapsed time per iteration (ms): 113891.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.538945E+00 | loss scale: 131072.0 | grad norm: 36600.547 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2990/ 292968 | consumed samples: 6123520 | consumed tokens: 872382464 | elapsed time per iteration (ms): 109876.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.546379E+00 | loss scale: 131072.0 | grad norm: 35165.563 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2991/ 292968 | consumed samples: 6125568 | consumed tokens: 872841216 | elapsed time per iteration (ms): 115377.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.587996E+00 | loss scale: 131072.0 | grad norm: 37518.416 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2992/ 292968 | consumed samples: 6127616 | consumed tokens: 873299968 | elapsed time per iteration (ms): 109282.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.585268E+00 | loss scale: 131072.0 | grad norm: 39054.225 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2993/ 292968 | consumed samples: 6129664 | consumed tokens: 873758720 | elapsed time per iteration (ms): 109391.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.578841E+00 | loss scale: 131072.0 | grad norm: 43147.170 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2994/ 292968 | consumed samples: 6131712 | consumed tokens: 874217472 | elapsed time per iteration (ms): 107969.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.575021E+00 | loss scale: 131072.0 | grad norm: 58412.414 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2995/ 292968 | consumed samples: 6133760 | consumed tokens: 874676224 | elapsed time per iteration (ms): 107960.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545457E+00 | loss scale: 131072.0 | grad norm: 77786.664 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2996/ 292968 | consumed samples: 6135808 | consumed tokens: 875134976 | elapsed time per iteration (ms): 109792.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.560975E+00 | loss scale: 131072.0 | grad norm: 66351.255 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2997/ 292968 | consumed samples: 6137856 | consumed tokens: 875593728 | elapsed time per iteration (ms): 107299.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549161E+00 | loss scale: 131072.0 | grad norm: 41282.400 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2998/ 292968 | consumed samples: 6139904 | consumed tokens: 876052480 | elapsed time per iteration (ms): 107801.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.583200E+00 | loss scale: 131072.0 | grad norm: 53236.886 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2999/ 292968 | consumed samples: 6141952 | consumed tokens: 876511232 | elapsed time per iteration (ms): 108443.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.559494E+00 | loss scale: 131072.0 | grad norm: 40175.908 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3000/ 292968 | consumed samples: 6144000 | consumed tokens: 876969984 | elapsed time per iteration (ms): 108892.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548572E+00 | loss scale: 262144.0 | grad norm: 40949.754 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 3000 | lm loss value: 3.549588E+00 | lm loss PPL: 3.479899E+01 |
-------------------------------------------------------------------------------------------------
-saving checkpoint at iteration 3000 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
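(Editorial sketch, not part of the original log: the progress records above use a fixed "key: value | key: value" layout, so they can be mined for loss and throughput curves. Everything below is an assumption based only on the fields visible above; the regex, the helper name parse_iteration_line, and the derived tokens-per-second figure are illustrative, not anything the training code emits. Note also that the logged "lm loss PPL" is simply exp of "lm loss value": exp(3.549588) ~ 34.799 ~ 3.479899E+01 at iteration 3000.)

import re

# Minimal sketch: turn one Megatron-style progress line into a dict of floats.
ITER_RE = re.compile(r"iteration\s+(\d+)/\s*(\d+)\s*\|(.*)")

def parse_iteration_line(line):
    m = ITER_RE.search(line)
    if m is None:
        return None
    fields = {"iteration": int(m.group(1)), "total_iterations": int(m.group(2))}
    for part in m.group(3).split("|"):
        if ":" not in part:
            continue
        key, value = part.split(":", 1)
        # normalize e.g. "elapsed time per iteration (ms)" -> "elapsed_time_per_iteration_ms"
        key = key.strip().lower().replace(" ", "_").replace("(", "").replace(")", "")
        try:
            fields[key] = float(value)
        except ValueError:
            fields[key] = value.strip()
    return fields

rec = parse_iteration_line("- iteration 3000/ 292968 | consumed tokens: 876969984 | elapsed time per iteration (ms): 108892.8 | global batch size: 2048 | lm loss: 3.548572E+00 | curriculum seqlen: 224 |")
# 2048 * 224 / 108.9 s ~ 4.2k tokens/s, matching the 458752-token step deltas above.
tokens_per_s = rec["global_batch_size"] * rec["curriculum_seqlen"] / (rec["elapsed_time_per_iteration_ms"] / 1000.0)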
-[2021-10-30 03:56:18,609] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/mp_rank_01_model_states.pt
-[2021-10-30 03:56:20,270] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/mp_rank_00_model_states.pt
-[2021-10-30 03:56:33,039] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_38_optim_states.pt
-[2021-10-30 03:56:33,075] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_29_optim_states.pt
-[2021-10-30 03:56:33,081] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_21_optim_states.pt
-[2021-10-30 03:56:33,100] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_57_optim_states.pt
-[2021-10-30 03:56:33,117] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_120_optim_states.pt
-[2021-10-30 03:56:33,119] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_28_optim_states.pt
-[2021-10-30 03:56:33,138] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_121_optim_states.pt
-[2021-10-30 03:56:33,164] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_82_optim_states.pt
-[2021-10-30 03:56:33,189] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_117_optim_states.pt
-[2021-10-30 03:56:33,238] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_46_optim_states.pt
-[2021-10-30 03:56:33,282] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_58_optim_states.pt
-[2021-10-30 03:56:33,309] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_86_optim_states.pt
-[2021-10-30 03:56:33,337] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-30 03:56:33,347] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_72_optim_states.pt
-[2021-10-30 03:56:33,354] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_83_optim_states.pt
[engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-30 03:56:33,366] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-30 03:56:33,366] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-30 03:56:33,435] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_119_optim_states.pt -[2021-10-30 03:56:33,465] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_64_optim_states.pt -[2021-10-30 03:56:33,472] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-30 03:56:33,474] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-30 03:56:33,498] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-30 03:56:33,510] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-30 03:56:33,525] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-30 03:56:33,536] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_97_optim_states.pt -[2021-10-30 03:56:33,608] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-30 03:56:33,613] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-30 03:56:33,684] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_87_optim_states.pt -[2021-10-30 03:56:33,709] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-30 03:56:33,719] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-30 03:56:33,750] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-30 03:56:33,786] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_79_optim_states.pt -[2021-10-30 03:56:33,789] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-30 03:56:33,812] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-30 03:56:33,850] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-30 03:56:33,856] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-30 03:56:33,901] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-30 03:56:33,944] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-30 03:56:34,000] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-30 03:56:34,122] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-30 03:56:34,195] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_40_optim_states.pt -[2021-10-30 03:56:34,222] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-30 03:56:34,247] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_25_optim_states.pt -[2021-10-30 03:56:34,267] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-30 03:56:34,283] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_63_optim_states.pt -[2021-10-30 03:56:34,295] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-30 03:56:34,304] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-30 03:56:34,308] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-30 03:56:34,314] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-30 03:56:34,357] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-30 03:56:34,375] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-30 03:56:34,383] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_12_optim_states.pt -[2021-10-30 03:56:34,406] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-30 03:56:34,406] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-30 03:56:34,429] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_111_optim_states.pt -[2021-10-30 03:56:34,430] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-30 03:56:34,445] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-30 03:56:34,456] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-30 03:56:34,467] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-30 03:56:34,476] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_84_optim_states.pt -[2021-10-30 03:56:34,479] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-30 03:56:34,492] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_27_optim_states.pt -[2021-10-30 03:56:34,500] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-30 03:56:34,528] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-30 03:56:34,534] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-30 03:56:34,538] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-30 03:56:34,558] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-30 03:56:34,566] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-30 03:56:34,566] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-30 03:56:34,567] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_106_optim_states.pt -[2021-10-30 03:56:34,598] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-30 03:56:34,603] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-30 03:56:34,605] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_14_optim_states.pt -[2021-10-30 03:56:34,623] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-30 03:56:34,626] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-30 03:56:34,628] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-30 03:56:34,640] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-30 03:56:34,654] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_107_optim_states.pt -[2021-10-30 03:56:34,663] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-30 03:56:34,679] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-30 03:56:34,686] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-30 03:56:34,688] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-30 03:56:34,771] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-30 03:56:34,805] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-30 03:56:34,828] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-30 03:56:34,846] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-30 03:56:34,911] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_115_optim_states.pt -[2021-10-30 03:56:34,925] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-30 03:56:34,935] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_08_optim_states.pt -[2021-10-30 03:56:34,963] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_33_optim_states.pt -[2021-10-30 03:56:34,994] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-30 03:56:35,078] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_35_optim_states.pt -[2021-10-30 03:56:35,262] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-30 03:56:35,516] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-30 03:56:35,633] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-30 03:56:36,092] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_10_optim_states.pt -[2021-10-30 03:56:36,533] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_45_optim_states.pt -[2021-10-30 03:56:36,696] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_56_optim_states.pt -[2021-10-30 03:56:37,006] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_05_optim_states.pt -[2021-10-30 03:56:37,444] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_04_optim_states.pt -[2021-10-30 03:56:37,750] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-30 03:56:38,252] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_125_optim_states.pt -[2021-10-30 03:56:38,400] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_01_optim_states.pt -[2021-10-30 03:56:38,919] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_06_optim_states.pt -[2021-10-30 03:56:38,947] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_09_optim_states.pt -[2021-10-30 03:56:39,106] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_07_optim_states.pt -[2021-10-30 03:56:39,460] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_11_optim_states.pt -[2021-10-30 03:56:39,586] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-30 03:56:39,861] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_42_optim_states.pt -[2021-10-30 03:56:40,831] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_24_optim_states.pt -[2021-10-30 03:56:40,949] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_03_optim_states.pt -[2021-10-30 03:56:41,006] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-30 03:56:41,772] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_34_optim_states.pt -[2021-10-30 03:56:41,817] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-30 03:56:42,059] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_23_optim_states.pt -[2021-10-30 03:56:42,088] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_127_optim_states.pt -[2021-10-30 03:56:42,131] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_20_optim_states.pt -[2021-10-30 03:56:42,613] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_31_optim_states.pt -[2021-10-30 03:56:43,004] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_124_optim_states.pt -[2021-10-30 03:56:43,105] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_44_optim_states.pt -[2021-10-30 03:56:43,901] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-30 03:56:44,010] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-30 03:56:44,634] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_13_optim_states.pt -[2021-10-30 03:56:45,772] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt -[2021-10-30 03:56:46,068] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_02_optim_states.pt -[2021-10-30 03:56:47,415] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_118_optim_states.pt -[2021-10-30 03:56:47,419] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-30 03:56:48,810] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3000/zero_pp_rank_0_mp_rank_126_optim_states.pt - successfully saved checkpoint at iteration 3000 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints -time (ms) | save-checkpoint: 33235.87 - iteration 3001/ 292968 | consumed samples: 6146048 | consumed tokens: 877428736 | elapsed time per iteration (ms): 321200.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549768E+00 | loss scale: 262144.0 | grad norm: 81690.481 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3002/ 292968 | consumed samples: 6148096 | consumed tokens: 877887488 | elapsed time per iteration (ms): 107943.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.538311E+00 | loss scale: 262144.0 | grad norm: 102831.829 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3003/ 292968 | consumed samples: 6150144 | consumed tokens: 878346240 | elapsed time per iteration (ms): 108767.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.574532E+00 | loss scale: 262144.0 | grad norm: 117110.170 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3004/ 292968 | consumed samples: 6152192 | consumed tokens: 878804992 | elapsed time per iteration (ms): 107881.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548190E+00 | loss scale: 262144.0 | grad norm: 116478.255 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3005/ 292968 | consumed samples: 6154240 | consumed tokens: 879263744 | elapsed time per iteration (ms): 108817.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.579916E+00 | loss scale: 262144.0 | grad norm: 86900.749 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3006/ 292968 | consumed samples: 6156288 | consumed tokens: 879722496 | elapsed time per iteration (ms): 108938.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.544670E+00 | loss scale: 262144.0 | grad norm: 82684.901 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3007/ 292968 | consumed samples: 6158336 | consumed tokens: 880181248 | elapsed time per iteration (ms): 107957.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.590149E+00 | loss scale: 262144.0 | grad norm: 80772.086 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3008/ 292968 | consumed samples: 6160384 | consumed tokens: 880640000 | elapsed time per iteration (ms): 107562.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.553613E+00 | loss scale: 262144.0 | grad norm: 89012.147 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3009/ 292968 | consumed samples: 6162432 | consumed tokens: 881098752 | elapsed time per iteration 
(ms): 108955.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.583788E+00 | loss scale: 262144.0 | grad norm: 89927.439 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3010/ 292968 | consumed samples: 6164480 | consumed tokens: 881557504 | elapsed time per iteration (ms): 110441.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545923E+00 | loss scale: 262144.0 | grad norm: 71626.926 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3011/ 292968 | consumed samples: 6166528 | consumed tokens: 882016256 | elapsed time per iteration (ms): 117605.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545170E+00 | loss scale: 262144.0 | grad norm: 90399.392 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3012/ 292968 | consumed samples: 6168576 | consumed tokens: 882475008 | elapsed time per iteration (ms): 126722.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.554401E+00 | loss scale: 262144.0 | grad norm: 109348.469 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3013/ 292968 | consumed samples: 6170624 | consumed tokens: 882933760 | elapsed time per iteration (ms): 130893.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.550539E+00 | loss scale: 262144.0 | grad norm: 92562.218 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3014/ 292968 | consumed samples: 6172672 | consumed tokens: 883392512 | elapsed time per iteration (ms): 125397.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.567821E+00 | loss scale: 262144.0 | grad norm: 68736.852 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3015/ 292968 | consumed samples: 6174720 | consumed tokens: 883851264 | elapsed time per iteration (ms): 111363.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.542690E+00 | loss scale: 262144.0 | grad norm: 82772.979 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3016/ 292968 | consumed samples: 6176768 | consumed tokens: 884310016 | elapsed time per iteration (ms): 112539.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.560461E+00 | loss scale: 262144.0 | grad norm: 87178.692 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3017/ 292968 | consumed samples: 6178816 | consumed tokens: 884768768 | elapsed time per iteration (ms): 110032.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.547208E+00 | loss scale: 262144.0 | grad norm: 101851.665 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3018/ 292968 | consumed samples: 6180864 | consumed tokens: 885227520 | elapsed time per iteration (ms): 111979.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.574103E+00 | loss scale: 262144.0 | grad norm: 100148.177 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 3019/ 292968 | consumed samples: 6182912 | consumed tokens: 885686272 | elapsed time per iteration (ms): 108474.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.570436E+00 | loss scale: 262144.0 | grad norm: 74941.147 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3020/ 292968 | consumed samples: 6184960 | consumed tokens: 886145024 | elapsed time per iteration (ms): 108347.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.564288E+00 | loss scale: 262144.0 | grad norm: 67732.383 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3021/ 292968 | consumed samples: 6187008 | consumed tokens: 886603776 | elapsed time per iteration (ms): 107167.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.550686E+00 | loss scale: 262144.0 | grad norm: 75716.783 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3022/ 292968 | consumed samples: 6189056 | consumed tokens: 887062528 | elapsed time per iteration (ms): 107313.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.566140E+00 | loss scale: 262144.0 | grad norm: 85551.713 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3023/ 292968 | consumed samples: 6191104 | consumed tokens: 887521280 | elapsed time per iteration (ms): 109501.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545317E+00 | loss scale: 262144.0 | grad norm: 87524.974 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3024/ 292968 | consumed samples: 6193152 | consumed tokens: 887980032 | elapsed time per iteration (ms): 109283.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.560166E+00 | loss scale: 262144.0 | grad norm: 89544.840 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3025/ 292968 | consumed samples: 6195200 | consumed tokens: 888438784 | elapsed time per iteration (ms): 108325.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.582637E+00 | loss scale: 262144.0 | grad norm: 82233.550 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3026/ 292968 | consumed samples: 6197248 | consumed tokens: 888897536 | elapsed time per iteration (ms): 110301.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.557073E+00 | loss scale: 262144.0 | grad norm: 84754.291 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3027/ 292968 | consumed samples: 6199296 | consumed tokens: 889356288 | elapsed time per iteration (ms): 109348.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.540827E+00 | loss scale: 262144.0 | grad norm: 92400.021 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3028/ 292968 | consumed samples: 6201344 | consumed tokens: 889815040 | elapsed time per iteration (ms): 109014.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.558088E+00 | loss 
scale: 262144.0 | grad norm: 70058.283 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3029/ 292968 | consumed samples: 6203392 | consumed tokens: 890273792 | elapsed time per iteration (ms): 107670.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.524401E+00 | loss scale: 262144.0 | grad norm: 92297.000 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3030/ 292968 | consumed samples: 6205440 | consumed tokens: 890732544 | elapsed time per iteration (ms): 111195.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.572511E+00 | loss scale: 262144.0 | grad norm: 111312.025 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3031/ 292968 | consumed samples: 6207488 | consumed tokens: 891191296 | elapsed time per iteration (ms): 109701.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.559071E+00 | loss scale: 262144.0 | grad norm: 124737.679 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3032/ 292968 | consumed samples: 6209536 | consumed tokens: 891650048 | elapsed time per iteration (ms): 109642.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.585852E+00 | loss scale: 262144.0 | grad norm: 109395.490 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3033/ 292968 | consumed samples: 6211584 | consumed tokens: 892108800 | elapsed time per iteration (ms): 108811.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.568167E+00 | loss scale: 262144.0 | grad norm: 92487.259 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3034/ 292968 | consumed samples: 6213632 | consumed tokens: 892567552 | elapsed time per iteration (ms): 107832.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.550674E+00 | loss scale: 262144.0 | grad norm: 80016.713 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3035/ 292968 | consumed samples: 6215680 | consumed tokens: 893026304 | elapsed time per iteration (ms): 108582.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.572785E+00 | loss scale: 262144.0 | grad norm: 68786.085 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3036/ 292968 | consumed samples: 6217728 | consumed tokens: 893485056 | elapsed time per iteration (ms): 107024.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.546546E+00 | loss scale: 262144.0 | grad norm: 92291.113 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3037/ 292968 | consumed samples: 6219776 | consumed tokens: 893943808 | elapsed time per iteration (ms): 109050.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.578622E+00 | loss scale: 262144.0 | grad norm: 119461.227 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3038/ 292968 | consumed samples: 6221824 | consumed tokens: 
894402560 | elapsed time per iteration (ms): 108808.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.544511E+00 | loss scale: 262144.0 | grad norm: 116278.091 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3039/ 292968 | consumed samples: 6223872 | consumed tokens: 894861312 | elapsed time per iteration (ms): 108850.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548750E+00 | loss scale: 262144.0 | grad norm: 86552.533 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3040/ 292968 | consumed samples: 6225920 | consumed tokens: 895320064 | elapsed time per iteration (ms): 108776.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.570335E+00 | loss scale: 262144.0 | grad norm: 85731.354 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3041/ 292968 | consumed samples: 6227968 | consumed tokens: 895778816 | elapsed time per iteration (ms): 111196.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.553517E+00 | loss scale: 262144.0 | grad norm: 105048.294 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3042/ 292968 | consumed samples: 6230016 | consumed tokens: 896237568 | elapsed time per iteration (ms): 107002.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.538068E+00 | loss scale: 262144.0 | grad norm: 98084.936 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3043/ 292968 | consumed samples: 6232064 | consumed tokens: 896696320 | elapsed time per iteration (ms): 107527.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.554549E+00 | loss scale: 262144.0 | grad norm: 71460.429 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3044/ 292968 | consumed samples: 6234112 | consumed tokens: 897155072 | elapsed time per iteration (ms): 108629.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.557918E+00 | loss scale: 262144.0 | grad norm: 96204.536 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3045/ 292968 | consumed samples: 6236160 | consumed tokens: 897613824 | elapsed time per iteration (ms): 109600.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548891E+00 | loss scale: 262144.0 | grad norm: 92450.730 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3046/ 292968 | consumed samples: 6238208 | consumed tokens: 898072576 | elapsed time per iteration (ms): 107059.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.539190E+00 | loss scale: 262144.0 | grad norm: 95690.474 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3047/ 292968 | consumed samples: 6240256 | consumed tokens: 898531328 | elapsed time per iteration (ms): 108981.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.572068E+00 | loss scale: 262144.0 | grad norm: 79860.685 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3048/ 292968 | consumed samples: 6242304 | consumed tokens: 898990080 | elapsed time per iteration (ms): 108035.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548295E+00 | loss scale: 262144.0 | grad norm: 64003.947 | num zeros: 0.0 | curriculum seqlen: 224 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3049/ 292968 | consumed samples: 6244352 | consumed tokens: 899465216 | elapsed time per iteration (ms): 110029.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545928E+00 | loss scale: 262144.0 | grad norm: 87441.257 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3050/ 292968 | consumed samples: 6246400 | consumed tokens: 899940352 | elapsed time per iteration (ms): 112465.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549430E+00 | loss scale: 262144.0 | grad norm: 97614.635 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3051/ 292968 | consumed samples: 6248448 | consumed tokens: 900415488 | elapsed time per iteration (ms): 110475.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.538041E+00 | loss scale: 262144.0 | grad norm: 89596.321 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3052/ 292968 | consumed samples: 6250496 | consumed tokens: 900890624 | elapsed time per iteration (ms): 110216.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549283E+00 | loss scale: 262144.0 | grad norm: 79236.208 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3053/ 292968 | consumed samples: 6252544 | consumed tokens: 901365760 | elapsed time per iteration (ms): 113292.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.563254E+00 | loss scale: 262144.0 | grad norm: 65109.373 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3054/ 292968 | consumed samples: 6254592 | consumed tokens: 901840896 | elapsed time per iteration (ms): 110325.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545700E+00 | loss scale: 262144.0 | grad norm: 66966.407 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3055/ 292968 | consumed samples: 6256640 | consumed tokens: 902316032 | elapsed time per iteration (ms): 110759.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.556716E+00 | loss scale: 262144.0 | grad norm: 62610.480 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3056/ 292968 | consumed samples: 6258688 | consumed tokens: 902791168 | elapsed time per iteration (ms): 110878.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.565084E+00 | loss scale: 262144.0 | grad norm: 68449.470 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3057/ 292968 | consumed samples: 6260736 | consumed tokens: 903266304 | elapsed time per iteration (ms): 111826.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm 
loss: 3.560848E+00 | loss scale: 262144.0 | grad norm: 77113.926 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3058/ 292968 | consumed samples: 6262784 | consumed tokens: 903741440 | elapsed time per iteration (ms): 109611.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.559057E+00 | loss scale: 262144.0 | grad norm: 70060.144 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3059/ 292968 | consumed samples: 6264832 | consumed tokens: 904216576 | elapsed time per iteration (ms): 109751.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536775E+00 | loss scale: 262144.0 | grad norm: 76298.706 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3060/ 292968 | consumed samples: 6266880 | consumed tokens: 904691712 | elapsed time per iteration (ms): 113169.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548581E+00 | loss scale: 262144.0 | grad norm: 99249.908 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3061/ 292968 | consumed samples: 6268928 | consumed tokens: 905166848 | elapsed time per iteration (ms): 116442.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555899E+00 | loss scale: 262144.0 | grad norm: 71388.401 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3062/ 292968 | consumed samples: 6270976 | consumed tokens: 905641984 | elapsed time per iteration (ms): 117107.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.565561E+00 | loss scale: 262144.0 | grad norm: 68738.748 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3063/ 292968 | consumed samples: 6273024 | consumed tokens: 906117120 | elapsed time per iteration (ms): 117329.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.560963E+00 | loss scale: 262144.0 | grad norm: 73663.725 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3064/ 292968 | consumed samples: 6275072 | consumed tokens: 906592256 | elapsed time per iteration (ms): 118862.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549268E+00 | loss scale: 262144.0 | grad norm: 88401.276 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3065/ 292968 | consumed samples: 6277120 | consumed tokens: 907067392 | elapsed time per iteration (ms): 113132.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.595171E+00 | loss scale: 262144.0 | grad norm: 78197.833 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3066/ 292968 | consumed samples: 6279168 | consumed tokens: 907542528 | elapsed time per iteration (ms): 111699.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.567431E+00 | loss scale: 262144.0 | grad norm: 85428.499 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3067/ 292968 | consumed samples: 6281216 | 
consumed tokens: 908017664 | elapsed time per iteration (ms): 110830.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.531442E+00 | loss scale: 262144.0 | grad norm: 78532.872 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3068/ 292968 | consumed samples: 6283264 | consumed tokens: 908492800 | elapsed time per iteration (ms): 109977.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549931E+00 | loss scale: 262144.0 | grad norm: 64098.801 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3069/ 292968 | consumed samples: 6285312 | consumed tokens: 908967936 | elapsed time per iteration (ms): 111572.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527401E+00 | loss scale: 262144.0 | grad norm: 82375.271 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3070/ 292968 | consumed samples: 6287360 | consumed tokens: 909443072 | elapsed time per iteration (ms): 109723.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.514494E+00 | loss scale: 262144.0 | grad norm: 94981.184 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3071/ 292968 | consumed samples: 6289408 | consumed tokens: 909918208 | elapsed time per iteration (ms): 110492.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.559234E+00 | loss scale: 262144.0 | grad norm: 90862.339 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3072/ 292968 | consumed samples: 6291456 | consumed tokens: 910393344 | elapsed time per iteration (ms): 110800.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.554221E+00 | loss scale: 262144.0 | grad norm: 91982.438 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3073/ 292968 | consumed samples: 6293504 | consumed tokens: 910868480 | elapsed time per iteration (ms): 111850.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.566242E+00 | loss scale: 262144.0 | grad norm: 81075.446 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3074/ 292968 | consumed samples: 6295552 | consumed tokens: 911343616 | elapsed time per iteration (ms): 111783.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.556693E+00 | loss scale: 262144.0 | grad norm: 75327.724 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3075/ 292968 | consumed samples: 6297600 | consumed tokens: 911818752 | elapsed time per iteration (ms): 112802.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.532203E+00 | loss scale: 262144.0 | grad norm: 77808.092 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3076/ 292968 | consumed samples: 6299648 | consumed tokens: 912293888 | elapsed time per iteration (ms): 111886.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536798E+00 | loss scale: 262144.0 | grad norm: 69181.190 | num zeros: 0.0 | curriculum seqlen: 232 | 
number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3077/ 292968 | consumed samples: 6301696 | consumed tokens: 912769024 | elapsed time per iteration (ms): 109788.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.529448E+00 | loss scale: 262144.0 | grad norm: 98417.458 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3078/ 292968 | consumed samples: 6303744 | consumed tokens: 913244160 | elapsed time per iteration (ms): 109949.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.558207E+00 | loss scale: 262144.0 | grad norm: 144977.392 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3079/ 292968 | consumed samples: 6305792 | consumed tokens: 913719296 | elapsed time per iteration (ms): 111355.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.542839E+00 | loss scale: 262144.0 | grad norm: 144887.374 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3080/ 292968 | consumed samples: 6307840 | consumed tokens: 914194432 | elapsed time per iteration (ms): 110023.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.568634E+00 | loss scale: 262144.0 | grad norm: 92941.350 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3081/ 292968 | consumed samples: 6309888 | consumed tokens: 914669568 | elapsed time per iteration (ms): 112256.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.564021E+00 | loss scale: 262144.0 | grad norm: 92941.350 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3082/ 292968 | consumed samples: 6311936 | consumed tokens: 915144704 | elapsed time per iteration (ms): 112050.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.573154E+00 | loss scale: 131072.0 | grad norm: 92941.350 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3083/ 292968 | consumed samples: 6313984 | consumed tokens: 915619840 | elapsed time per iteration (ms): 110067.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.587767E+00 | loss scale: 131072.0 | grad norm: 111957.591 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3084/ 292968 | consumed samples: 6316032 | consumed tokens: 916094976 | elapsed time per iteration (ms): 110023.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549948E+00 | loss scale: 131072.0 | grad norm: 97896.334 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3085/ 292968 | consumed samples: 6318080 | consumed tokens: 916570112 | elapsed time per iteration (ms): 112215.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.590836E+00 | loss scale: 131072.0 | grad norm: 69343.441 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3086/ 292968 | consumed samples: 6320128 | consumed tokens: 917045248 | elapsed time per iteration (ms): 109520.9 | learning rate: 1.000E-04 | global 
batch size: 2048 | lm loss: 3.571056E+00 | loss scale: 131072.0 | grad norm: 63993.728 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3087/ 292968 | consumed samples: 6322176 | consumed tokens: 917520384 | elapsed time per iteration (ms): 110110.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.561241E+00 | loss scale: 131072.0 | grad norm: 59094.102 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3088/ 292968 | consumed samples: 6324224 | consumed tokens: 917995520 | elapsed time per iteration (ms): 110599.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.568187E+00 | loss scale: 131072.0 | grad norm: 48906.726 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3089/ 292968 | consumed samples: 6326272 | consumed tokens: 918470656 | elapsed time per iteration (ms): 111035.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.589565E+00 | loss scale: 131072.0 | grad norm: 49687.770 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3090/ 292968 | consumed samples: 6328320 | consumed tokens: 918945792 | elapsed time per iteration (ms): 110708.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.530910E+00 | loss scale: 131072.0 | grad norm: 40254.463 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3091/ 292968 | consumed samples: 6330368 | consumed tokens: 919420928 | elapsed time per iteration (ms): 110329.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.551384E+00 | loss scale: 131072.0 | grad norm: 42286.336 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3092/ 292968 | consumed samples: 6332416 | consumed tokens: 919896064 | elapsed time per iteration (ms): 111680.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.581002E+00 | loss scale: 131072.0 | grad norm: 33542.876 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3093/ 292968 | consumed samples: 6334464 | consumed tokens: 920371200 | elapsed time per iteration (ms): 111080.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548149E+00 | loss scale: 131072.0 | grad norm: 37645.822 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3094/ 292968 | consumed samples: 6336512 | consumed tokens: 920846336 | elapsed time per iteration (ms): 114983.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.561650E+00 | loss scale: 131072.0 | grad norm: 45264.420 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3095/ 292968 | consumed samples: 6338560 | consumed tokens: 921321472 | elapsed time per iteration (ms): 112957.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.530528E+00 | loss scale: 131072.0 | grad norm: 59561.033 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3096/ 292968 | consumed 
samples: 6340608 | consumed tokens: 921796608 | elapsed time per iteration (ms): 111460.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.529258E+00 | loss scale: 131072.0 | grad norm: 36811.120 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3097/ 292968 | consumed samples: 6342656 | consumed tokens: 922271744 | elapsed time per iteration (ms): 111452.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.556875E+00 | loss scale: 131072.0 | grad norm: 31223.968 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3098/ 292968 | consumed samples: 6344704 | consumed tokens: 922746880 | elapsed time per iteration (ms): 110963.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.560329E+00 | loss scale: 131072.0 | grad norm: 43357.196 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3099/ 292968 | consumed samples: 6346752 | consumed tokens: 923222016 | elapsed time per iteration (ms): 109338.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.528105E+00 | loss scale: 131072.0 | grad norm: 55024.526 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3100/ 292968 | consumed samples: 6348800 | consumed tokens: 923697152 | elapsed time per iteration (ms): 109711.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.541577E+00 | loss scale: 131072.0 | grad norm: 37820.128 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3101/ 292968 | consumed samples: 6350848 | consumed tokens: 924172288 | elapsed time per iteration (ms): 109829.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.521191E+00 | loss scale: 131072.0 | grad norm: 38476.139 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3102/ 292968 | consumed samples: 6352896 | consumed tokens: 924647424 | elapsed time per iteration (ms): 109162.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.541833E+00 | loss scale: 131072.0 | grad norm: 42999.338 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3103/ 292968 | consumed samples: 6354944 | consumed tokens: 925122560 | elapsed time per iteration (ms): 109209.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527602E+00 | loss scale: 131072.0 | grad norm: 41025.070 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3104/ 292968 | consumed samples: 6356992 | consumed tokens: 925597696 | elapsed time per iteration (ms): 108775.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.519634E+00 | loss scale: 131072.0 | grad norm: 31961.766 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3105/ 292968 | consumed samples: 6359040 | consumed tokens: 926072832 | elapsed time per iteration (ms): 109808.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545305E+00 | loss scale: 131072.0 | grad norm: 41148.398 | num zeros: 0.0 | curriculum 
seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3106/ 292968 | consumed samples: 6361088 | consumed tokens: 926547968 | elapsed time per iteration (ms): 109343.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.534677E+00 | loss scale: 131072.0 | grad norm: 33930.951 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3107/ 292968 | consumed samples: 6363136 | consumed tokens: 927023104 | elapsed time per iteration (ms): 110093.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.565519E+00 | loss scale: 131072.0 | grad norm: 33777.532 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3108/ 292968 | consumed samples: 6365184 | consumed tokens: 927498240 | elapsed time per iteration (ms): 110410.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.537144E+00 | loss scale: 131072.0 | grad norm: 40416.615 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3109/ 292968 | consumed samples: 6367232 | consumed tokens: 927973376 | elapsed time per iteration (ms): 111377.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527619E+00 | loss scale: 131072.0 | grad norm: 46857.043 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3110/ 292968 | consumed samples: 6369280 | consumed tokens: 928448512 | elapsed time per iteration (ms): 110496.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555134E+00 | loss scale: 131072.0 | grad norm: 60420.377 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3111/ 292968 | consumed samples: 6371328 | consumed tokens: 928923648 | elapsed time per iteration (ms): 111479.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.542067E+00 | loss scale: 131072.0 | grad norm: 47053.293 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3112/ 292968 | consumed samples: 6373376 | consumed tokens: 929398784 | elapsed time per iteration (ms): 109535.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.538254E+00 | loss scale: 131072.0 | grad norm: 41897.336 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3113/ 292968 | consumed samples: 6375424 | consumed tokens: 929873920 | elapsed time per iteration (ms): 112139.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.540478E+00 | loss scale: 131072.0 | grad norm: 43233.715 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3114/ 292968 | consumed samples: 6377472 | consumed tokens: 930349056 | elapsed time per iteration (ms): 109783.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.507294E+00 | loss scale: 131072.0 | grad norm: 38971.265 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3115/ 292968 | consumed samples: 6379520 | consumed tokens: 930824192 | elapsed time per iteration (ms): 109544.5 | learning rate: 1.000E-04 
| global batch size: 2048 | lm loss: 3.527198E+00 | loss scale: 131072.0 | grad norm: 39431.657 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3116/ 292968 | consumed samples: 6381568 | consumed tokens: 931299328 | elapsed time per iteration (ms): 109791.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.543795E+00 | loss scale: 131072.0 | grad norm: 35911.906 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3117/ 292968 | consumed samples: 6383616 | consumed tokens: 931774464 | elapsed time per iteration (ms): 109068.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.553530E+00 | loss scale: 131072.0 | grad norm: 31794.593 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3118/ 292968 | consumed samples: 6385664 | consumed tokens: 932249600 | elapsed time per iteration (ms): 111130.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.521324E+00 | loss scale: 131072.0 | grad norm: 37780.759 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3119/ 292968 | consumed samples: 6387712 | consumed tokens: 932724736 | elapsed time per iteration (ms): 110038.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.533055E+00 | loss scale: 131072.0 | grad norm: 36496.675 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3120/ 292968 | consumed samples: 6389760 | consumed tokens: 933199872 | elapsed time per iteration (ms): 110677.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.529382E+00 | loss scale: 131072.0 | grad norm: 35531.822 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3121/ 292968 | consumed samples: 6391808 | consumed tokens: 933675008 | elapsed time per iteration (ms): 111492.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548238E+00 | loss scale: 131072.0 | grad norm: 44060.029 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3122/ 292968 | consumed samples: 6393856 | consumed tokens: 934150144 | elapsed time per iteration (ms): 110377.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.546454E+00 | loss scale: 131072.0 | grad norm: 50136.311 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3123/ 292968 | consumed samples: 6395904 | consumed tokens: 934625280 | elapsed time per iteration (ms): 110624.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.544405E+00 | loss scale: 131072.0 | grad norm: 58389.993 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3124/ 292968 | consumed samples: 6397952 | consumed tokens: 935100416 | elapsed time per iteration (ms): 109969.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.526251E+00 | loss scale: 131072.0 | grad norm: 46223.258 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3125/ 292968 | 
consumed samples: 6400000 | consumed tokens: 935575552 | elapsed time per iteration (ms): 110453.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.521111E+00 | loss scale: 131072.0 | grad norm: 47758.541 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3126/ 292968 | consumed samples: 6402048 | consumed tokens: 936050688 | elapsed time per iteration (ms): 110497.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.529240E+00 | loss scale: 131072.0 | grad norm: 43012.626 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3127/ 292968 | consumed samples: 6404096 | consumed tokens: 936525824 | elapsed time per iteration (ms): 109904.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.557906E+00 | loss scale: 131072.0 | grad norm: 38612.784 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3128/ 292968 | consumed samples: 6406144 | consumed tokens: 937000960 | elapsed time per iteration (ms): 108913.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.531578E+00 | loss scale: 131072.0 | grad norm: 41133.483 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3129/ 292968 | consumed samples: 6408192 | consumed tokens: 937476096 | elapsed time per iteration (ms): 110361.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.493152E+00 | loss scale: 131072.0 | grad norm: 37207.043 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3130/ 292968 | consumed samples: 6410240 | consumed tokens: 937951232 | elapsed time per iteration (ms): 110191.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.501123E+00 | loss scale: 131072.0 | grad norm: 38164.113 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3131/ 292968 | consumed samples: 6412288 | consumed tokens: 938426368 | elapsed time per iteration (ms): 111607.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.532626E+00 | loss scale: 131072.0 | grad norm: 33127.285 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3132/ 292968 | consumed samples: 6414336 | consumed tokens: 938901504 | elapsed time per iteration (ms): 109170.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.537379E+00 | loss scale: 131072.0 | grad norm: 34759.150 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3133/ 292968 | consumed samples: 6416384 | consumed tokens: 939376640 | elapsed time per iteration (ms): 110333.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.516551E+00 | loss scale: 131072.0 | grad norm: 34224.005 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3134/ 292968 | consumed samples: 6418432 | consumed tokens: 939851776 | elapsed time per iteration (ms): 111757.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.518162E+00 | loss scale: 131072.0 | grad norm: 36712.897 | num zeros: 0.0 | 
curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3135/ 292968 | consumed samples: 6420480 | consumed tokens: 940326912 | elapsed time per iteration (ms): 110838.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.531169E+00 | loss scale: 131072.0 | grad norm: 45871.157 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3136/ 292968 | consumed samples: 6422528 | consumed tokens: 940802048 | elapsed time per iteration (ms): 111400.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.530567E+00 | loss scale: 131072.0 | grad norm: 40914.375 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3137/ 292968 | consumed samples: 6424576 | consumed tokens: 941277184 | elapsed time per iteration (ms): 110125.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.528954E+00 | loss scale: 131072.0 | grad norm: 46189.970 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3138/ 292968 | consumed samples: 6426624 | consumed tokens: 941752320 | elapsed time per iteration (ms): 110514.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.515621E+00 | loss scale: 131072.0 | grad norm: 47886.409 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3139/ 292968 | consumed samples: 6428672 | consumed tokens: 942227456 | elapsed time per iteration (ms): 111383.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.544991E+00 | loss scale: 131072.0 | grad norm: 44561.397 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3140/ 292968 | consumed samples: 6430720 | consumed tokens: 942702592 | elapsed time per iteration (ms): 109945.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.550672E+00 | loss scale: 131072.0 | grad norm: 55870.403 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3141/ 292968 | consumed samples: 6432768 | consumed tokens: 943177728 | elapsed time per iteration (ms): 111833.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.535420E+00 | loss scale: 131072.0 | grad norm: 54687.584 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3142/ 292968 | consumed samples: 6434816 | consumed tokens: 943652864 | elapsed time per iteration (ms): 109935.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.554422E+00 | loss scale: 131072.0 | grad norm: 46354.847 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3143/ 292968 | consumed samples: 6436864 | consumed tokens: 944128000 | elapsed time per iteration (ms): 110450.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.515105E+00 | loss scale: 131072.0 | grad norm: 42457.256 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3144/ 292968 | consumed samples: 6438912 | consumed tokens: 944603136 | elapsed time per iteration (ms): 110392.8 | learning rate: 
1.000E-04 | global batch size: 2048 | lm loss: 3.491606E+00 | loss scale: 131072.0 | grad norm: 47675.537 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3145/ 292968 | consumed samples: 6440960 | consumed tokens: 945078272 | elapsed time per iteration (ms): 110165.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.545086E+00 | loss scale: 131072.0 | grad norm: 40437.099 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3146/ 292968 | consumed samples: 6443008 | consumed tokens: 945553408 | elapsed time per iteration (ms): 109112.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.519203E+00 | loss scale: 131072.0 | grad norm: 40121.803 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3147/ 292968 | consumed samples: 6445056 | consumed tokens: 946028544 | elapsed time per iteration (ms): 109992.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.507916E+00 | loss scale: 131072.0 | grad norm: 39602.549 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3148/ 292968 | consumed samples: 6447104 | consumed tokens: 946503680 | elapsed time per iteration (ms): 110837.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.501790E+00 | loss scale: 131072.0 | grad norm: 37185.032 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3149/ 292968 | consumed samples: 6449152 | consumed tokens: 946978816 | elapsed time per iteration (ms): 109989.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.528901E+00 | loss scale: 131072.0 | grad norm: 44056.823 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3150/ 292968 | consumed samples: 6451200 | consumed tokens: 947453952 | elapsed time per iteration (ms): 110689.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.500736E+00 | loss scale: 131072.0 | grad norm: 34733.114 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-------------------------------------------------------------------------------------------------
- validation loss at iteration 3150 | lm loss value: 3.517273E+00 | lm loss PPL: 3.369244E+01 |
-------------------------------------------------------------------------------------------------
- iteration 3151/ 292968 | consumed samples: 6453248 | consumed tokens: 947929088 | elapsed time per iteration (ms): 289717.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.513340E+00 | loss scale: 131072.0 | grad norm: 35613.642 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3152/ 292968 | consumed samples: 6455296 | consumed tokens: 948404224 | elapsed time per iteration (ms): 109415.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.519228E+00 | loss scale: 131072.0 | grad norm: 46331.769 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3153/ 292968 | consumed samples: 6457344 | consumed tokens: 948879360 | elapsed
time per iteration (ms): 108618.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.528655E+00 | loss scale: 131072.0 | grad norm: 62191.264 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3154/ 292968 | consumed samples: 6459392 | consumed tokens: 949354496 | elapsed time per iteration (ms): 109050.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.531178E+00 | loss scale: 131072.0 | grad norm: 55588.878 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3155/ 292968 | consumed samples: 6461440 | consumed tokens: 949829632 | elapsed time per iteration (ms): 111657.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.522779E+00 | loss scale: 131072.0 | grad norm: 44837.393 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3156/ 292968 | consumed samples: 6463488 | consumed tokens: 950304768 | elapsed time per iteration (ms): 110189.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.523057E+00 | loss scale: 131072.0 | grad norm: 43731.420 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3157/ 292968 | consumed samples: 6465536 | consumed tokens: 950779904 | elapsed time per iteration (ms): 110493.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.496690E+00 | loss scale: 131072.0 | grad norm: 46192.470 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3158/ 292968 | consumed samples: 6467584 | consumed tokens: 951255040 | elapsed time per iteration (ms): 109909.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.517199E+00 | loss scale: 131072.0 | grad norm: 31717.912 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3159/ 292968 | consumed samples: 6469632 | consumed tokens: 951730176 | elapsed time per iteration (ms): 110040.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.518413E+00 | loss scale: 131072.0 | grad norm: 40340.483 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3160/ 292968 | consumed samples: 6471680 | consumed tokens: 952205312 | elapsed time per iteration (ms): 111087.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.519091E+00 | loss scale: 131072.0 | grad norm: 32898.784 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3161/ 292968 | consumed samples: 6473728 | consumed tokens: 952680448 | elapsed time per iteration (ms): 109338.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527358E+00 | loss scale: 131072.0 | grad norm: 34774.966 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3162/ 292968 | consumed samples: 6475776 | consumed tokens: 953155584 | elapsed time per iteration (ms): 108656.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.513849E+00 | loss scale: 131072.0 | grad norm: 39540.117 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number 
of nan iterations: 0 | -time (ms) - iteration 3163/ 292968 | consumed samples: 6477824 | consumed tokens: 953630720 | elapsed time per iteration (ms): 109547.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.511124E+00 | loss scale: 131072.0 | grad norm: 48375.830 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3164/ 292968 | consumed samples: 6479872 | consumed tokens: 954105856 | elapsed time per iteration (ms): 113586.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.508611E+00 | loss scale: 131072.0 | grad norm: 52037.682 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3165/ 292968 | consumed samples: 6481920 | consumed tokens: 954580992 | elapsed time per iteration (ms): 114860.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.541578E+00 | loss scale: 131072.0 | grad norm: 41480.973 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3166/ 292968 | consumed samples: 6483968 | consumed tokens: 955056128 | elapsed time per iteration (ms): 121137.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.516208E+00 | loss scale: 131072.0 | grad norm: 41301.397 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3167/ 292968 | consumed samples: 6486016 | consumed tokens: 955531264 | elapsed time per iteration (ms): 110110.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.504046E+00 | loss scale: 131072.0 | grad norm: 47013.136 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3168/ 292968 | consumed samples: 6488064 | consumed tokens: 956006400 | elapsed time per iteration (ms): 110799.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.523125E+00 | loss scale: 131072.0 | grad norm: 53442.123 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3169/ 292968 | consumed samples: 6490112 | consumed tokens: 956481536 | elapsed time per iteration (ms): 109797.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.518640E+00 | loss scale: 131072.0 | grad norm: 44658.960 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3170/ 292968 | consumed samples: 6492160 | consumed tokens: 956956672 | elapsed time per iteration (ms): 109397.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.506108E+00 | loss scale: 131072.0 | grad norm: 37584.401 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3171/ 292968 | consumed samples: 6494208 | consumed tokens: 957431808 | elapsed time per iteration (ms): 110495.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.506019E+00 | loss scale: 131072.0 | grad norm: 33635.084 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3172/ 292968 | consumed samples: 6496256 | consumed tokens: 957906944 | elapsed time per iteration (ms): 109863.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.563114E+00 | 
loss scale: 131072.0 | grad norm: 40481.486 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3173/ 292968 | consumed samples: 6498304 | consumed tokens: 958382080 | elapsed time per iteration (ms): 110984.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.540124E+00 | loss scale: 131072.0 | grad norm: 48680.218 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3174/ 292968 | consumed samples: 6500352 | consumed tokens: 958857216 | elapsed time per iteration (ms): 109893.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.520960E+00 | loss scale: 131072.0 | grad norm: 50210.268 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3175/ 292968 | consumed samples: 6502400 | consumed tokens: 959332352 | elapsed time per iteration (ms): 111336.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.507021E+00 | loss scale: 131072.0 | grad norm: 40907.580 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3176/ 292968 | consumed samples: 6504448 | consumed tokens: 959807488 | elapsed time per iteration (ms): 109713.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527415E+00 | loss scale: 131072.0 | grad norm: 47008.511 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3177/ 292968 | consumed samples: 6506496 | consumed tokens: 960282624 | elapsed time per iteration (ms): 110170.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.507471E+00 | loss scale: 131072.0 | grad norm: 47137.325 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3178/ 292968 | consumed samples: 6508544 | consumed tokens: 960757760 | elapsed time per iteration (ms): 110343.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.498612E+00 | loss scale: 131072.0 | grad norm: 40007.189 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3179/ 292968 | consumed samples: 6510592 | consumed tokens: 961232896 | elapsed time per iteration (ms): 110775.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.562264E+00 | loss scale: 131072.0 | grad norm: 36207.314 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3180/ 292968 | consumed samples: 6512640 | consumed tokens: 961708032 | elapsed time per iteration (ms): 110885.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.538762E+00 | loss scale: 131072.0 | grad norm: 46057.776 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3181/ 292968 | consumed samples: 6514688 | consumed tokens: 962183168 | elapsed time per iteration (ms): 110780.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.511930E+00 | loss scale: 131072.0 | grad norm: 57056.263 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3182/ 292968 | consumed samples: 6516736 | consumed tokens: 
962658304 | elapsed time per iteration (ms): 111044.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.529408E+00 | loss scale: 131072.0 | grad norm: 53534.245 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3183/ 292968 | consumed samples: 6518784 | consumed tokens: 963133440 | elapsed time per iteration (ms): 110096.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.501729E+00 | loss scale: 131072.0 | grad norm: 42867.291 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3184/ 292968 | consumed samples: 6520832 | consumed tokens: 963608576 | elapsed time per iteration (ms): 107449.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.522443E+00 | loss scale: 131072.0 | grad norm: 52216.063 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3185/ 292968 | consumed samples: 6522880 | consumed tokens: 964083712 | elapsed time per iteration (ms): 110918.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.526424E+00 | loss scale: 131072.0 | grad norm: 47640.665 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3186/ 292968 | consumed samples: 6524928 | consumed tokens: 964558848 | elapsed time per iteration (ms): 107939.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.543110E+00 | loss scale: 131072.0 | grad norm: 28613.723 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-saving checkpoint at iteration 3186 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-[2021-10-30 09:43:03,725] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/mp_rank_00_model_states.pt
-[2021-10-30 09:43:05,523] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/mp_rank_01_model_states.pt
-[2021-10-30 09:43:18,251] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_120_optim_states.pt
-[2021-10-30 09:43:18,276] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_38_optim_states.pt
-[2021-10-30 09:43:18,328] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_57_optim_states.pt
-[2021-10-30 09:43:18,330] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_45_optim_states.pt
-[2021-10-30 09:43:18,444] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_22_optim_states.pt
-[2021-10-30 09:43:18,487] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_59_optim_states.pt -[2021-10-30 09:43:18,498] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_87_optim_states.pt -[2021-10-30 09:43:18,507] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_123_optim_states.pt -[2021-10-30 09:43:18,512] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_116_optim_states.pt -[2021-10-30 09:43:18,512] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_82_optim_states.pt -[2021-10-30 09:43:18,553] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_47_optim_states.pt -[2021-10-30 09:43:18,594] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_86_optim_states.pt -[2021-10-30 09:43:18,617] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_16_optim_states.pt -[2021-10-30 09:43:18,627] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_89_optim_states.pt -[2021-10-30 09:43:18,629] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_53_optim_states.pt -[2021-10-30 09:43:18,630] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_50_optim_states.pt -[2021-10-30 09:43:18,634] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_30_optim_states.pt -[2021-10-30 09:43:18,656] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_109_optim_states.pt -[2021-10-30 09:43:18,662] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_76_optim_states.pt -[2021-10-30 09:43:18,695] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_52_optim_states.pt -[2021-10-30 09:43:18,716] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_92_optim_states.pt -[2021-10-30 09:43:18,730] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_18_optim_states.pt -[2021-10-30 09:43:18,739] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_75_optim_states.pt -[2021-10-30 09:43:18,760] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_74_optim_states.pt -[2021-10-30 09:43:18,770] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_36_optim_states.pt -[2021-10-30 09:43:18,830] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_70_optim_states.pt -[2021-10-30 09:43:18,933] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_115_optim_states.pt -[2021-10-30 09:43:18,943] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_100_optim_states.pt -[2021-10-30 09:43:18,943] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_66_optim_states.pt -[2021-10-30 09:43:18,960] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_67_optim_states.pt -[2021-10-30 09:43:18,973] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_101_optim_states.pt -[2021-10-30 09:43:19,025] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_77_optim_states.pt -[2021-10-30 09:43:19,025] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_81_optim_states.pt -[2021-10-30 09:43:19,068] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_110_optim_states.pt -[2021-10-30 09:43:19,068] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_69_optim_states.pt -[2021-10-30 09:43:19,077] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_60_optim_states.pt -[2021-10-30 09:43:19,079] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_94_optim_states.pt -[2021-10-30 09:43:19,141] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_96_optim_states.pt -[2021-10-30 09:43:19,173] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_97_optim_states.pt -[2021-10-30 09:43:19,247] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_113_optim_states.pt -[2021-10-30 09:43:19,339] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_49_optim_states.pt -[2021-10-30 09:43:19,450] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_85_optim_states.pt -[2021-10-30 09:43:19,454] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_62_optim_states.pt -[2021-10-30 09:43:19,480] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_84_optim_states.pt -[2021-10-30 09:43:19,503] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_37_optim_states.pt -[2021-10-30 09:43:19,506] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_41_optim_states.pt -[2021-10-30 09:43:19,511] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_88_optim_states.pt -[2021-10-30 09:43:19,535] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_68_optim_states.pt -[2021-10-30 09:43:19,562] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_91_optim_states.pt -[2021-10-30 09:43:19,575] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_55_optim_states.pt -[2021-10-30 09:43:19,599] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_105_optim_states.pt -[2021-10-30 09:43:19,607] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_95_optim_states.pt -[2021-10-30 09:43:19,609] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_26_optim_states.pt -[2021-10-30 09:43:19,615] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_80_optim_states.pt -[2021-10-30 09:43:19,630] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_121_optim_states.pt -[2021-10-30 09:43:19,636] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_19_optim_states.pt -[2021-10-30 09:43:19,647] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_15_optim_states.pt -[2021-10-30 09:43:19,650] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_112_optim_states.pt -[2021-10-30 09:43:19,650] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_111_optim_states.pt -[2021-10-30 09:43:19,652] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_48_optim_states.pt -[2021-10-30 09:43:19,656] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_103_optim_states.pt -[2021-10-30 09:43:19,664] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_78_optim_states.pt -[2021-10-30 09:43:19,666] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_104_optim_states.pt -[2021-10-30 09:43:19,671] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_54_optim_states.pt -[2021-10-30 09:43:19,680] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_14_optim_states.pt -[2021-10-30 09:43:19,682] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_102_optim_states.pt -[2021-10-30 09:43:19,717] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_90_optim_states.pt -[2021-10-30 09:43:19,721] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_43_optim_states.pt -[2021-10-30 09:43:19,722] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_39_optim_states.pt -[2021-10-30 09:43:19,747] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_17_optim_states.pt -[2021-10-30 09:43:19,753] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_114_optim_states.pt -[2021-10-30 09:43:19,793] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_83_optim_states.pt -[2021-10-30 09:43:19,795] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_79_optim_states.pt -[2021-10-30 09:43:19,857] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_51_optim_states.pt -[2021-10-30 09:43:19,870] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_32_optim_states.pt -[2021-10-30 09:43:19,881] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_108_optim_states.pt -[2021-10-30 09:43:19,890] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_93_optim_states.pt -[2021-10-30 09:43:19,901] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_98_optim_states.pt -[2021-10-30 09:43:19,933] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_122_optim_states.pt -[2021-10-30 09:43:19,959] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_71_optim_states.pt -[2021-10-30 09:43:19,975] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_61_optim_states.pt -[2021-10-30 09:43:19,984] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_99_optim_states.pt -[2021-10-30 09:43:20,002] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_72_optim_states.pt -[2021-10-30 09:43:20,022] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_65_optim_states.pt -[2021-10-30 09:43:20,037] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_73_optim_states.pt -[2021-10-30 09:43:20,081] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved 
/gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_64_optim_states.pt
-[2021-10-30 09:43:20,098] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_08_optim_states.pt
-[2021-10-30 09:43:20,162] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_106_optim_states.pt
-[2021-10-30 09:43:20,226] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_63_optim_states.pt
-[2021-10-30 09:43:20,304] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_34_optim_states.pt
-[2021-10-30 09:43:20,425] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_28_optim_states.pt
-[2021-10-30 09:43:20,492] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_107_optim_states.pt
-[2021-10-30 09:43:20,819] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_11_optim_states.pt
-[2021-10-30 09:43:20,997] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_58_optim_states.pt
-[2021-10-30 09:43:21,197] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_21_optim_states.pt
-[2021-10-30 09:43:21,475] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_27_optim_states.pt
-[2021-10-30 09:43:21,735] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_07_optim_states.pt
-[2021-10-30 09:43:21,838] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_56_optim_states.pt
-[2021-10-30 09:43:22,451] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_20_optim_states.pt
-[2021-10-30 09:43:22,574] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_119_optim_states.pt
-[2021-10-30 09:43:22,601] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_04_optim_states.pt
-[2021-10-30 09:43:23,091] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_118_optim_states.pt
-[2021-10-30 09:43:23,406] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_42_optim_states.pt
-[2021-10-30 09:43:23,523] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_24_optim_states.pt
-[2021-10-30 09:43:23,989] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_05_optim_states.pt
-[2021-10-30 09:43:24,276] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_40_optim_states.pt
-[2021-10-30 09:43:24,339] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_06_optim_states.pt
-[2021-10-30 09:43:24,791] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_10_optim_states.pt
-[2021-10-30 09:43:24,821] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_00_optim_states.pt
-[2021-10-30 09:43:24,840] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_124_optim_states.pt
-[2021-10-30 09:43:25,064] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_25_optim_states.pt
-[2021-10-30 09:43:25,362] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_31_optim_states.pt
-[2021-10-30 09:43:25,612] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_02_optim_states.pt
-[2021-10-30 09:43:25,943] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_126_optim_states.pt
-[2021-10-30 09:43:26,443] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_09_optim_states.pt
-[2021-10-30 09:43:26,556] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_23_optim_states.pt
-[2021-10-30 09:43:26,585] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_35_optim_states.pt
-[2021-10-30 09:43:27,500] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_33_optim_states.pt
-[2021-10-30 09:43:27,521] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_29_optim_states.pt
-[2021-10-30 09:43:28,913] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_46_optim_states.pt
-[2021-10-30 09:43:29,477] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_12_optim_states.pt
-[2021-10-30 09:43:29,874] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_13_optim_states.pt
-[2021-10-30 09:43:30,193] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_44_optim_states.pt
-[2021-10-30 09:43:31,471] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_117_optim_states.pt
-[2021-10-30 09:43:36,023] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_127_optim_states.pt
-[2021-10-30 09:43:36,931] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_125_optim_states.pt
-[2021-10-30 09:43:37,641] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_01_optim_states.pt
-[2021-10-30 09:43:38,484] [INFO] [engine.py:2540:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/global_step3186/zero_pp_rank_0_mp_rank_03_optim_states.pt
- successfully saved checkpoint at iteration 3186 to /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
-time (ms) | save-checkpoint: 37755.83
-[exiting program after 1190.4787764191628 minutes] datetime: 2021-10-30 09:43:38
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
[OKAY]compatible - --------------------------------------------------- --------------------------------------------------- -op name ................ installed cpu_adam.. ............... compatible[NO] - .......-------------------------------------------------- -[OKAY] -cpu_adam ............... fused_adam[NO] .................... [NO][OKAY] -....... [OKAY] -fused_lamb ............. [NO]fused_adam ....... .............[OKAY] -[NO] ....... [OKAY] -fused_lamb ............. sparse_attn [NO]............ .......[NO] [OKAY]....... - [OKAY] -transformer ............ [NO] ....... [OKAY] -sparse_attn ............stochastic_transformer [NO] ........ [NO] [OKAY]....... - [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja-------------------------------------------------- - - -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. 
compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ninja....... ..................[OKAY] -[OKAY] --------------------------------------------------- -op name ................ installed ..sparse_attn compatible............ --------------------------------------------------- - [NO] ....... [OKAY] -cpu_adam ...............ninjatransformer [NO].............................. ....... [OKAY][OKAY] - -[NO]-------------------------------------------------- - op name....... [OKAY]................ - fused_adaminstalled ............... [NO]compatible -stochastic_transformer.......-------------------------------------------------- -[OKAY]. -[NO]cpu_adam .......fused_lamb............... .............[NO] [OKAY][NO] - .............. [OKAY][OKAY] - -fused_adam .............sparse_attn [NO]............ .......[NO] [OKAY]....... - [OKAY] -fused_lambtransformer ......................... [NO][NO] .............. [OKAY][OKAY] - -stochastic_transformer . [NO] .......sparse_attn [OKAY]............ - [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... 
[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -ninja fused_adam.................. [OKAY]............. - [NO] --------------------------------------------------....... - [OKAY]op name - ................fused_lamb installed............. ..[NO] compatible....... - [OKAY]-------------------------------------------------- - -cpu_adam ............... [NO] ....... sparse_attn[OKAY] -............ [NO] ....... [OKAY] -transformer ............ fused_adam[NO] .................... [NO][OKAY] -....... [OKAY] -stochastic_transformer fused_lamb .............. [NO][NO] .............. [OKAY][OKAY] - -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ 
installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.sparse_attn - ............-------------------------------------------------- -[NO] JIT compiled ops requires ninja....... - [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -fused_adamop name ............................. [NO]installed ......... [OKAY]compatible - --------------------------------------------------- -fused_lamb ............. [NO] ....... [OKAY] -cpu_adam ............... [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... fused_adam[OKAY] -............. [NO] transformer....... 
............[OKAY] -[NO] ....... [OKAY]fused_lamb - ............. [NO]stochastic_transformer ....... [OKAY]. - [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - --------------------------------------------------- ---------------------------------------------------JIT compiled ops requires ninja - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - -JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. 
[NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninjaninja .................................... [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -op nameop name ................................ installedinstalled .... compatiblecompatible - ----------------------------------------------------------------------------------------------------- - -cpu_adamcpu_adam .............................. [NO][NO] .............. [OKAY][OKAY] - -fused_adamfused_adam .......................... [NO][NO] .............. [OKAY][OKAY] - -fused_lambfused_lamb .......................... [NO][NO] .............. [OKAY][OKAY] - -sparse_attnsparse_attn ........................ [NO][NO] .............. [OKAY][OKAY] - -transformertransformer ........................ [NO][NO] .............. [OKAY][OKAY] - -stochastic_transformerstochastic_transformer .. [NO][NO] .............. [OKAY][OKAY] - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lambninja ............. ..................[NO] [OKAY]....... - --------------------------------------------------[OKAY] - -op name ................ installed .. compatible --------------------------------------------------- -sparse_attn ............ [NO] .......cpu_adam [OKAY]............... -[NO] ....... transformer[OKAY] -............ [NO] ....... [OKAY] -stochastic_transformer fused_adam .............. [NO][NO] .............. [OKAY] -[OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -ninjasparse_attn .............................. [NO][OKAY] ....... 
- [OKAY]-------------------------------------------------- - -op name transformer................ ............installed [NO].. .......compatible -[OKAY] --------------------------------------------------- -stochastic_transformer . [NO] .......cpu_adam [OKAY]............... - [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] .......ninja [OKAY] -.................. [OKAY] --------------------------------------------------- -op name ................ installed fused_adam.. compatible............. - --------------------------------------------------[NO] - ....... 
[OKAY] -ninjafused_lamb cpu_adam .................. ............................ [OKAY][NO] - [NO].......-------------------------------------------------- -[OKAY]....... -op name [OKAY]................ - installed .. compatible --------------------------------------------------- -fused_adam ............. [NO] ....... [OKAY] -cpu_adamsparse_attn ...........................fused_lamb [NO][NO]............. ..............[NO] [OKAY].......[OKAY] - -[OKAY] -transformer ............ [NO] ....... [OKAY]fused_adam - ............. [NO]stochastic_transformer sparse_attn....... .............[OKAY] -[NO][NO] .............. fused_lamb [OKAY] [OKAY] -............. - [NO]transformer ................... [OKAY][NO] - ....... [OKAY] -stochastic_transformer . [NO] .......sparse_attn [OKAY] -............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... 
[OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - - -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -ninja .................. [OKAY] -ninja-------------------------------------------------- - sparse_attn..................op name ............[OKAY]................ 
-[NO] installed--------------------------------------------------....... - ..[OKAY]op name - compatible................ -transformer -------------------------------------------------- -installed............ ninja ..[NO] ..................compatible....... cpu_adam - [OKAY] ---------------------------------------------------[OKAY]............... - --------------------------------------------------- -[NO] stochastic_transformerop name....... ................cpu_adam. [OKAY] [NO]installed............... -.. .......[NO]compatible - [OKAY]-------------------------------------------------- - -.......fused_adam [OKAY]............. - [NO] ....... [OKAY]cpu_adam - ............... [NO] fused_lamb....... fused_adam.............[OKAY] -.............[NO] [NO]....... .......[OKAY] -[OKAY] -fused_adamfused_lamb ............. .............[NO] sparse_attn[NO]....... ............[OKAY]....... - [NO][OKAY] -.......fused_lamb .............[OKAY] -[NO] ....... transformer[OKAY] -sparse_attn............ ............ [NO][NO] .............. [OKAY][OKAY] - -transformer stochastic_transformer............ sparse_attn .[NO]............ [NO][NO] ....... .............. [OKAY][OKAY][OKAY] - - -stochastic_transformertransformer ............ .[NO] [NO]....... [OKAY]....... - [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] ---------------------------------------------------ninja - DeepSpeed C++/CUDA extension op report.................. - [OKAY]-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - --------------------------------------------------- -op nameJIT compiled ops requires ninja -................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. 
[OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [NO] ninja....... ..................[OKAY] -[OKAY] --------------------------------------------------- -op name ................ installed fused_adam.. compatible............. - [NO]-------------------------------------------------- -....... [OKAY] -fused_lambcpu_adam ............................ [NO][NO] .............. [OKAY] -[OKAY] -fused_adamsparse_attn ............. ............[NO] .......[NO] [OKAY]....... - [OKAY] -fused_lambtransformer ......................... [NO] [NO]....... .......[OKAY] -[OKAY] -stochastic_transformer . [NO] ....... [OKAY]sparse_attn - ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -ninja .................. 
- [WARNING] async_io requires the dev libaio .so object and headers but these were not found.
- [WARNING] async_io: please install the libaio-devel package with yum
- [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [NO] ....... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
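The repeated warning means the async_io op cannot be JIT-built on these nodes: it needs the libaio development files (libaio-devel on yum-based systems, libaio-dev on apt-based ones), or CFLAGS/LDFLAGS pointed at a from-source install. That is why async_io shows [NO] in both columns of the report above while the other inference ops remain [OKAY]. A quick check from Python, assuming the AsyncIOBuilder class of this DeepSpeed version:

    # Sketch: test whether async_io could be built (AsyncIOBuilder assumed from DeepSpeed 0.5.x).
    from deepspeed.ops.op_builder import AsyncIOBuilder

    if AsyncIOBuilder().is_compatible():
        print("libaio found: async_io can be JIT-compiled")
    else:
        # Matches the 'async_io ... [NO] ....... [NO]' line: install the libaio dev
        # package, or point the compiler at a source build before launching, e.g.
        #   CFLAGS=-I/path/to/libaio/include LDFLAGS=-L/path/to/libaio/lib deepspeed ...
        print("libaio development files missing: async_io stays disabled")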
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed']
-deepspeed info ................... 0.5.5+29bee73, 29bee73, master
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
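This per-rank environment block records the toolchain used for the JIT builds: torch 1.8.1 built against CUDA 11.1, nvcc from the CUDA 11.2 toolkit on the PATH (a minor-version mismatch that DeepSpeed accepts within CUDA 11), and a DeepSpeed 0.5.5 source checkout at commit 29bee73 on master. A sketch that gathers the same fields, assuming DeepSpeed exports __git_hash__ and __git_branch__ as the 0.5.x packages do; the nvcc lookup via CUDA_HOME is likewise an assumption about how the log's nvcc line is derived:

    # Sketch: recreate the "DeepSpeed general environment info:" fields.
    import subprocess
    import torch
    import deepspeed
    from torch.utils.cpp_extension import CUDA_HOME

    print("torch install path ...", list(torch.__path__))
    print("torch version ........", torch.__version__)
    print("torch cuda version ...", torch.version.cuda)   # CUDA torch was built with
    if CUDA_HOME:                                         # toolkit used for JIT builds
        print(subprocess.check_output([CUDA_HOME + "/bin/nvcc", "--version"], text=True))
    print("deepspeed install path", list(deepspeed.__path__))
    print("deepspeed info ........", deepspeed.__version__,
          deepspeed.__git_hash__, deepspeed.__git_branch__)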
11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum - - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO]utils ......................... [OKAY][NO] - ....... [OKAY] -utilsquantizer .................. ..............[NO] [NO]....... [OKAY]....... - [OKAY] -quantizer ..............-------------------------------------------------- -[NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version DeepSpeed general environment info:............... 11.1 - -nvcc version ..................... 11.2torch install path -deepspeed install path .......................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch']................... -0.5.5+29bee73, 29bee73, master -torch versiondeepspeed wheel compiled w. .......................... 1.8.1torch 1.8, cuda 11.1 - -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... 
[OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... [NO] ....... utils[NO] -.................. [NO] ....... [OKAY] -quantizer .............. [NO]transformer_inference ......... [OKAY][NO] - ....... [OKAY] --------------------------------------------------- -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... 
[OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -DeepSpeed general environment info:torch cuda version ............... -11.1 -nvcc version ..................... 11.2torch install path -deepspeed install path .......................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ...................['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] 0.5.5+29bee73, 29bee73, master - -deepspeed wheel compiled w.torch version ...... ....................torch 1.8, cuda 11.1 -1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. 
[NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. 
- [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -....... [NO] -async_iotransformer_inference ................. [NO][NO] .............. [OKAY][NO] - -utils .................. [NO] ....... [OKAY] -transformer_inferencequantizer .. ..............[NO] [NO]....... .......[OKAY] -[OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -utils-------------------------------------------------- -async_io ............... [NO] ....... [NO] -.................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... 
[OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference ..async_io [NO] ...................... [OKAY][NO] - ....... [NO] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] .......transformer_inference [OKAY].. - [NO] --------------------------------------------------....... - [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 
0.5.5+29bee73, 29bee73, master - [WARNING]  async_io: please install the libaio-devel package with yum -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... 
['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch install path ............... 
['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -async_io ............... [NO] ....... [NO] -torch cuda version ............... 11.1 -transformer_inference .. [NO] ....... [OKAY] -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io: please install the libaio-devel package with yum -DeepSpeed general environment info: - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -async_io ............... [NO] ....... [NO] -torch cuda version ............... 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -transformer_inference .. [NO] ....... [OKAY] -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -utils .................. [NO] ....... [OKAY] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 
1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the dev libaio .so object and headers but these were not found. -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... 
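Each rank prints the same "DeepSpeed general environment info" and op-status block once at startup, so the block above stands for all ranks. The same report can be regenerated offline with DeepSpeed's bundled reporter (CLI as shipped with DeepSpeed 0.5.x):

    ds_report
    # equivalently:
    python -m deepspeed.env_report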
-**** Git info for Megatron: git_hash=bdc6ad6 git_branch=main ****
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [NO] ....... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
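Every op in the report shows installed=[NO] but compatible=[OKAY]: nothing was prebuilt, so ninja JIT-compiles each kernel on first use. A sketch of prebuilding at install time instead, using DeepSpeed's DS_BUILD_* environment switches (combine flags as needed):

    # Prebuild just the fused optimizers used by this job:
    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 pip install deepspeed
    # Or prebuild every op (slower install, no first-step compile stall):
    DS_BUILD_OPS=1 pip install deepspeed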
[OKAY][OKAY] - - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... [NO] ....... [NO] -utils .................. [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum [WARNING]  async_io: please install the libaio-devel package with yum -cpu_adam ............... [NO] ....... [OKAY] - -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - -async_ioasync_io ............... ...............[NO] [NO]....... .......[NO] -[NO] -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -utils ..................utils [NO].................. .......[NO] [OKAY]....... - [OKAY] -quantizerquantizer ............................ [NO][NO] .............. [OKAY][OKAY] - ----------------------------------------------------------------------------------------------------- - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/DeepSpeed/deepspeed'] -deepspeed info ................... 0.5.5+29bee73, 29bee73, master -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [NO] ....... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io: please install the libaio-devel package with yum - [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -async_io ............... [NO] ....... [NO] -transformer_inference .. 
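The op-compatibility table and environment block above are what DeepSpeed's bundled `ds_report` diagnostic prints on each rank; a minimal sketch to regenerate the same report on a node (assuming the `ds_report` console script shipped with DeepSpeed 0.5.x is on the PATH):

    import subprocess

    # Runs DeepSpeed's diagnostic CLI, which prints the C++/CUDA extension
    # op report and the "general environment info" block seen in this log,
    # once, for the local environment.
    subprocess.run(["ds_report"], check=True)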
-> setting tensorboard ...
-using world size: 128, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 32
-using torch.float16 for parameters ...
------------------------- arguments ------------------------
- accumulate_allreduce_grads_in_fp32 .............. False
- adam_beta1 ...................................... 0.9
- adam_beta2 ...................................... 0.95
- adam_eps ........................................ 1e-08
- adlr_autoresume ................................. False
- adlr_autoresume_interval ........................ 1000
- apply_query_key_layer_scaling ................... True
- apply_residual_connection_post_layernorm ........ False
- attention_dropout ............................... 0.1
- attention_softmax_in_fp32 ....................... False
- bert_binary_head ................................ True
- bert_load ....................................... None
- bf16 ............................................ False
- bias_dropout_fusion ............................. True
- bias_gelu_fusion ................................ True
- biencoder_projection_dim ........................ 0
- biencoder_shared_query_context_model ............ False
- block_data_path ................................. None
- checkpoint_activations .......................... True
- checkpoint_in_cpu ............................... False
- checkpoint_num_layers ........................... 1
- clip_grad ....................................... 1.0
- codecarbon_dir .................................. None
- consumed_train_samples .......................... 0
- consumed_train_tokens ........................... 0
- consumed_valid_samples .......................... 0
- contigious_checkpointing ........................ False
- cpu_optimizer ................................... False
- cpu_torch_adam .................................. False
- curriculum_learning ............................. False
- data_impl ....................................... mmap
- data_parallel_size .............................. 1
- data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
- dataloader_type ................................. single
- DDP_impl ........................................ local
- decoder_seq_length .............................. None
- deepscale ....................................... False
- deepscale_config ................................ None
- deepspeed ....................................... True
- deepspeed_activation_checkpointing .............. True
- deepspeed_config ................................ ./ds_config.1768527.json
- deepspeed_mpi ................................... False
- distribute_checkpointed_activations ............. False
- distributed_backend ............................. nccl
- embedding_path .................................. None
- encoder_seq_length .............................. 2048
- eod_mask_loss ................................... False
- eval_interval ................................... 150
- eval_iters ...................................... 5
- evidence_data_path .............................. None
- exit_duration_in_mins ........................... 1190
- exit_interval ................................... None
- ffn_hidden_size ................................. 46400
- finetune ........................................ False
- fp16 ............................................ True
- fp16_lm_cross_entropy ........................... False
- fp32_residual_connection ........................ False
- gigaflos_no_embeds .............................. 0
- global_batch_size ............................... 2048
- glu_activation .................................. None
- hidden_dropout .................................. 0.1
- hidden_size ..................................... 11600
- hysteresis ...................................... 2
- ict_head_size ................................... None
- ict_load ........................................ None
- img_dim ......................................... 224
- indexer_batch_size .............................. 128
- indexer_log_interval ............................ 1000
- init_method_std ................................. 0.006
- init_method_xavier_uniform ...................... False
- initial_loss_scale .............................. 4294967296
- kv_channels ..................................... 145
- layernorm_epsilon ............................... 1e-05
- lazy_mpu_init ................................... None
- load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- local_rank ...................................... 0
- log_batch_size_to_tensorboard ................... True
- log_interval .................................... 1
- log_learning_rate_to_tensorboard ................ True
- log_loss_scale_to_tensorboard ................... True
- log_num_zeros_in_grad ........................... False
- log_params_norm ................................. False
- log_timers_to_tensorboard ....................... True
- log_validation_ppl_to_tensorboard ............... True
- loss_on_targets_only ............................ False
- loss_scale ...................................... 12.0
- loss_scale_window ............................... 1000
- lr .............................................. 0.0001
- lr_decay_iters .................................. None
- lr_decay_samples ................................ None
- lr_decay_style .................................. cosine
- lr_decay_tokens ................................. 260000000000
- lr_warmup_fraction .............................. None
- lr_warmup_iters ................................. 0
- lr_warmup_samples ............................... 3750000
- make_vocab_size_divisible_by .................... 128
- mask_prob ....................................... 0.15
- masked_softmax_fusion ........................... True
- max_position_embeddings ......................... 2048
- memory_centric_tiled_linear ..................... False
- merge_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-merges.txt
- micro_batch_size ................................ 1
- min_loss_scale .................................. 1.0
- min_lr .......................................... 6e-06
- mmap_warmup ..................................... False
- no_load_optim ................................... None
- no_load_rng ..................................... None
- no_save_optim ................................... None
- no_save_rng ..................................... None
- num_attention_heads ............................. 80
- num_channels .................................... 3
- num_classes .................................... 1000
- num_layers ...................................... 64
- num_layers_per_virtual_pipeline_stage ........... None
- num_workers ..................................... 2
- onnx_safe ....................................... None
- openai_gelu ..................................... False
- optimizer ....................................... adam
- override_lr_scheduler ........................... False
- params_dtype .................................... torch.float16
- partition_activations ........................... False
- patch_dim ....................................... 16
- pipeline_model_parallel_size .................... 32
- position_embedding_type ......................... PositionEmbeddingType.absolute
- profile_backward ................................ False
- query_in_block_prob ............................. 0.1
- rampup_batch_size ............................... None
- rank ............................................ 0
- remote_device ................................... none
- reset_attention_mask ............................ False
- reset_position_ids .............................. False
- retriever_report_topk_accuracies ................ []
- retriever_score_scaling ......................... False
- retriever_seq_length ............................ 256
- sample_rate ..................................... 1.0
- save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints
- save_interval ................................... 300
- scatter_gather_tensors_in_pipeline .............. True
- scattered_embeddings ............................ False
- seed ............................................ 43
- seq_length ...................................... 2048
- sgd_momentum .................................... 0.9
- short_seq_prob .................................. 0.1
- split ........................................... 949,50,1
- split_transformers .............................. False
- synchronize_each_layer .......................... False
- tensor_model_parallel_size ...................... 4
- tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/tr8b-104B-logs/tensorboard
- tensorboard_log_interval ........................ 1
- tensorboard_queue_size .......................... 5
- tile_factor ..................................... 1
- titles_data_path ................................ None
- tokenizer_name_or_path .......................... None
- tokenizer_type .................................. GPT2BPETokenizer
- train_iters ..................................... None
- train_samples ................................... 600000000
- train_tokens .................................... 300000000000
- use_bnb_optimizer ............................... False
- use_checkpoint_lr_scheduler ..................... False
- use_contiguous_buffers_in_ddp ................... False
- use_cpu_initialization .......................... None
- use_one_sent_docs ............................... False
- use_pin_memory .................................. False
- virtual_pipeline_model_parallel_size ............ None
- vocab_extra_ids ................................. 0
- vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed-tr8b-104B/data/gpt2-vocab.json
- weight_decay .................................... 0.1
- world_size ...................................... 128
- zero_allgather_bucket_size ...................... 0.0
- zero_contigious_gradients ....................... False
- zero_reduce_bucket_size ......................... 0.0
- zero_reduce_scatter ............................. False
- zero_stage ...................................... 1
--------------------- end of arguments ---------------------
-setting number of micro-batches to constant 2048
-> building GPT2BPETokenizer tokenizer ...
- > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688)
-> initializing torch distributed ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 32
-> setting random seeds to 43 ...
-[2021-10-30 09:44:10,628] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2761 and data parallel seed: 43
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/data'
->>> done with dataset index builder. Compilation time: 0.311 seconds
-> compiling and loading fused kernels ...
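The parallelism topology, micro-batch count, and vocab padding reported above are mutually consistent; a small worked check (plain Python, values copied from the arguments dump; the padding rule follows Megatron's convention of rounding the vocab up to a multiple of make_vocab_size_divisible_by * tensor_model_parallel_size):

    # Topology: world_size = TP x PP x DP.
    tp, pp, dp = 4, 32, 1
    assert tp * pp * dp == 128                        # matches world_size 128

    # Micro-batches per global batch, with micro_batch_size=1 and DP=1.
    global_batch_size, micro_batch_size = 2048, 1
    assert global_batch_size // (micro_batch_size * dp) == 2048  # "constant 2048"

    # Vocab padding: round 50257 up to a multiple of 128 * TP = 512.
    orig_vocab, multiple = 50257, 128 * tp
    padded = ((orig_vocab + multiple - 1) // multiple) * multiple
    assert padded == 50688 and padded - orig_vocab == 431        # 431 dummy tokens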
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
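The compiler-mismatch warning above is harmless here: the cached fused kernels load anyway, hence "ninja: no work to do.". The usual remedy is to point PyTorch's JIT extension builder at g++ explicitly. A hedged sketch, assuming torch 1.8's cpp_extension, which reads the compiler from the CXX environment variable and falls back to plain `c++`:

    import os

    # Must be set before the first fused-kernel JIT build is triggered;
    # torch.utils.cpp_extension consults CXX when choosing the C++ compiler,
    # so this silences the c++ vs. g++ mismatch warning.
    os.environ["CXX"] = "g++"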
-/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
->>> done with compiling and loading fused kernels. Compilation time: 21.865 seconds
-time to initialize megatron (seconds): 31.194
-[after megatron is initialized] datetime: 2021-10-30 09:44:32
-building GPT model ...
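The warning block above is PyTorch's WRONG_COMPILER_WARNING, emitted once per rank by torch.utils.cpp_extension while Megatron JIT-builds its fused kernels (the step that finishes at ">>> done with compiling and loading fused kernels"). A minimal sketch of that mechanism, assuming only a PyTorch install and a host C++ toolchain; the toy extension below is a hypothetical illustration, not Megatron's code:

# Sketch: JIT-build a C++ extension the way Megatron builds its fused
# kernels. torch.utils.cpp_extension.load() drives the host compiler (c++),
# and a mismatch with the compiler PyTorch was built with (g++) is what
# triggers the WRONG_COMPILER_WARNING seen above. "toy_ext" is made up.
import pathlib
import tempfile

import torch
from torch.utils import cpp_extension

CPP_SRC = r"""
#include <torch/extension.h>

torch::Tensor scale(torch::Tensor x, double a) {
  return x * a;  // stand-in for a real fused kernel
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("scale", &scale, "scale a tensor by a constant");
}
"""

src = pathlib.Path(tempfile.mkdtemp()) / "toy.cpp"
src.write_text(CPP_SRC)

# load() compiles with ninja into the torch_extensions cache directory,
# which is also where the "Using ... as PyTorch extensions root" and
# "ninja: no work to do." lines further down come from.
toy = cpp_extension.load(name="toy_ext", sources=[str(src)], verbose=True)
print(toy.scale(torch.ones(3), 2.0))  # tensor([2., 2., 2.])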
-[2021-10-30 09:44:32,860] [INFO] [utils.py:806:see_memory_usage] Before Building Model
-[2021-10-30 09:44:32,861] [INFO] [utils.py:807:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-10-30 09:44:32,861] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.67 GB, percent = 22.3%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=1, data=0, model=0): 4, ProcessCoord(pipe=1, data=0, model=1): 5, ProcessCoord(pipe=1, data=0, model=2): 6, ProcessCoord(pipe=1, data=0, model=3): 7, ProcessCoord(pipe=2, data=0, model=0): 8, ProcessCoord(pipe=2, data=0, model=1): 9, ProcessCoord(pipe=2, data=0, model=2): 10, ProcessCoord(pipe=2, data=0, model=3): 11, ProcessCoord(pipe=3, data=0, model=0): 12, ProcessCoord(pipe=3, data=0, model=1): 13, ProcessCoord(pipe=3, data=0, model=2): 14, ProcessCoord(pipe=3, data=0, model=3): 15, ProcessCoord(pipe=4, data=0, model=0): 16, ProcessCoord(pipe=4, data=0, model=1): 17, ProcessCoord(pipe=4, data=0, model=2): 18, ProcessCoord(pipe=4, data=0, model=3): 19, ProcessCoord(pipe=5, data=0, model=0): 20, ProcessCoord(pipe=5, data=0, model=1): 21, ProcessCoord(pipe=5, data=0, model=2): 22, ProcessCoord(pipe=5, data=0, model=3): 23, ProcessCoord(pipe=6, data=0, model=0): 24, ProcessCoord(pipe=6, data=0, model=1): 25, ProcessCoord(pipe=6, data=0, model=2): 26, ProcessCoord(pipe=6, data=0, model=3): 27, ProcessCoord(pipe=7, data=0, model=0): 28, ProcessCoord(pipe=7, data=0, model=1): 29, ProcessCoord(pipe=7, data=0, model=2): 30, ProcessCoord(pipe=7, data=0, model=3): 31, ProcessCoord(pipe=8, data=0, model=0): 32, ProcessCoord(pipe=8, data=0, model=1): 33, ProcessCoord(pipe=8, data=0, model=2): 34, ProcessCoord(pipe=8, data=0, model=3): 35, ProcessCoord(pipe=9, data=0, model=0): 36, ProcessCoord(pipe=9, data=0, model=1): 37, ProcessCoord(pipe=9, data=0, model=2): 38, ProcessCoord(pipe=9, data=0, model=3): 39, ProcessCoord(pipe=10, data=0, model=0): 40, ProcessCoord(pipe=10, data=0, model=1): 41, ProcessCoord(pipe=10, data=0, model=2): 42, ProcessCoord(pipe=10, data=0, model=3): 43, ProcessCoord(pipe=11, data=0, model=0): 44, ProcessCoord(pipe=11, data=0, model=1): 45, ProcessCoord(pipe=11, data=0, model=2): 46, ProcessCoord(pipe=11, data=0, model=3): 47, ProcessCoord(pipe=12, data=0, model=0): 48, ProcessCoord(pipe=12, data=0, model=1): 49, ProcessCoord(pipe=12, data=0, model=2): 50, ProcessCoord(pipe=12, data=0, model=3): 51, ProcessCoord(pipe=13, data=0, model=0): 52, ProcessCoord(pipe=13, data=0, model=1): 53, ProcessCoord(pipe=13, data=0, model=2): 54, ProcessCoord(pipe=13, data=0, model=3): 55, ProcessCoord(pipe=14, data=0, model=0): 56, ProcessCoord(pipe=14, data=0, model=1): 57, ProcessCoord(pipe=14, data=0, model=2): 58, ProcessCoord(pipe=14, data=0, model=3): 59, ProcessCoord(pipe=15, data=0, model=0): 60, ProcessCoord(pipe=15, data=0, model=1): 61, ProcessCoord(pipe=15, data=0, model=2): 62, ProcessCoord(pipe=15, data=0, model=3): 63, ProcessCoord(pipe=16, data=0, model=0): 64, ProcessCoord(pipe=16, data=0, model=1): 65, ProcessCoord(pipe=16, data=0, model=2): 66, ProcessCoord(pipe=16, data=0, model=3): 67, ProcessCoord(pipe=17, data=0, model=0): 68, ProcessCoord(pipe=17, data=0, model=1): 69, ProcessCoord(pipe=17, data=0, model=2): 70, ProcessCoord(pipe=17, data=0, model=3): 71, ProcessCoord(pipe=18, data=0, model=0): 72, ProcessCoord(pipe=18, data=0, model=1): 73, ProcessCoord(pipe=18, data=0, model=2): 74, ProcessCoord(pipe=18, data=0, model=3): 75, ProcessCoord(pipe=19, data=0, model=0): 76, ProcessCoord(pipe=19, data=0, model=1): 77, ProcessCoord(pipe=19, data=0, model=2): 78, ProcessCoord(pipe=19, data=0, model=3): 79, ProcessCoord(pipe=20, data=0, model=0): 80, ProcessCoord(pipe=20, data=0, model=1): 81, ProcessCoord(pipe=20, data=0, model=2): 82, ProcessCoord(pipe=20, data=0, model=3): 83, ProcessCoord(pipe=21, data=0, model=0): 84, ProcessCoord(pipe=21, data=0, model=1): 85, ProcessCoord(pipe=21, data=0, model=2): 86, ProcessCoord(pipe=21, data=0, model=3): 87, ProcessCoord(pipe=22, data=0, model=0): 88, ProcessCoord(pipe=22, data=0, model=1): 89, ProcessCoord(pipe=22, data=0, model=2): 90, ProcessCoord(pipe=22, data=0, model=3): 91, ProcessCoord(pipe=23, data=0, model=0): 92, ProcessCoord(pipe=23, data=0, model=1): 93, ProcessCoord(pipe=23, data=0, model=2): 94, ProcessCoord(pipe=23, data=0, model=3): 95, ProcessCoord(pipe=24, data=0, model=0): 96, ProcessCoord(pipe=24, data=0, model=1): 97, ProcessCoord(pipe=24, data=0, model=2): 98, ProcessCoord(pipe=24, data=0, model=3): 99, ProcessCoord(pipe=25, data=0, model=0): 100, ProcessCoord(pipe=25, data=0, model=1): 101, ProcessCoord(pipe=25, data=0, model=2): 102, ProcessCoord(pipe=25, data=0, model=3): 103, ProcessCoord(pipe=26, data=0, model=0): 104, ProcessCoord(pipe=26, data=0, model=1): 105, ProcessCoord(pipe=26, data=0, model=2): 106, ProcessCoord(pipe=26, data=0, model=3): 107, ProcessCoord(pipe=27, data=0, model=0): 108, ProcessCoord(pipe=27, data=0, model=1): 109, ProcessCoord(pipe=27, data=0, model=2): 110, ProcessCoord(pipe=27, data=0, model=3): 111, ProcessCoord(pipe=28, data=0, model=0): 112, ProcessCoord(pipe=28, data=0, model=1): 113, ProcessCoord(pipe=28, data=0, model=2): 114, ProcessCoord(pipe=28, data=0, model=3): 115, ProcessCoord(pipe=29, data=0, model=0): 116, ProcessCoord(pipe=29, data=0, model=1): 117, ProcessCoord(pipe=29, data=0, model=2): 118, ProcessCoord(pipe=29, data=0, model=3): 119, ProcessCoord(pipe=30, data=0, model=0): 120, ProcessCoord(pipe=30, data=0, model=1): 121, ProcessCoord(pipe=30, data=0, model=2): 122, ProcessCoord(pipe=30, data=0, model=3): 123, ProcessCoord(pipe=31, data=0, model=0): 124, ProcessCoord(pipe=31, data=0, model=1): 125, ProcessCoord(pipe=31, data=0, model=2): 126, ProcessCoord(pipe=31, data=0, model=3): 127}
-[2021-10-30 09:44:34,533] [INFO] [module.py:365:_partition_layers] Partitioning pipeline stages with method type:transformer
-stage=0 layers=5
-     0: _to_float16
-     1: EmbeddingPipe
-     2: 
-     3: ParallelTransformerLayerPipe
-     4: ParallelTransformerLayerPipe
-stage=1 layers=2
-     5: ParallelTransformerLayerPipe
-     6: ParallelTransformerLayerPipe
-stage=2 layers=2
-     7: ParallelTransformerLayerPipe
-     8: ParallelTransformerLayerPipe
-stage=3 layers=2
-     9: ParallelTransformerLayerPipe
-    10: ParallelTransformerLayerPipe
-stage=4 layers=2
-    11: ParallelTransformerLayerPipe
-    12: ParallelTransformerLayerPipe
-stage=5 layers=2
-    13: ParallelTransformerLayerPipe
-    14: ParallelTransformerLayerPipe
-stage=6 layers=2
-    15: ParallelTransformerLayerPipe
-    16: ParallelTransformerLayerPipe
-stage=7 layers=2
-    17: ParallelTransformerLayerPipe
-    18: ParallelTransformerLayerPipe
-stage=8 layers=2
-    19: ParallelTransformerLayerPipe
-    20: ParallelTransformerLayerPipe
-stage=9 layers=2
-    21: ParallelTransformerLayerPipe
-    22: ParallelTransformerLayerPipe
-stage=10 layers=2
-    23: ParallelTransformerLayerPipe
-    24: ParallelTransformerLayerPipe
-stage=11 layers=2
-    25: ParallelTransformerLayerPipe
-    26: ParallelTransformerLayerPipe
-stage=12 layers=2
-    27: ParallelTransformerLayerPipe
-    28: ParallelTransformerLayerPipe
-stage=13 layers=2
-    29: ParallelTransformerLayerPipe
-    30: ParallelTransformerLayerPipe
-stage=14 layers=2
-    31: ParallelTransformerLayerPipe
-    32: ParallelTransformerLayerPipe
-stage=15 layers=2
-    33: ParallelTransformerLayerPipe
-    34: ParallelTransformerLayerPipe
-stage=16 layers=2
-    35: ParallelTransformerLayerPipe
-    36: ParallelTransformerLayerPipe
-stage=17 layers=2
-    37: ParallelTransformerLayerPipe
-    38: ParallelTransformerLayerPipe
-stage=18 layers=2
-    39: ParallelTransformerLayerPipe
-    40: ParallelTransformerLayerPipe
-stage=19 layers=2
-    41: ParallelTransformerLayerPipe
-    42: ParallelTransformerLayerPipe
-stage=20 layers=2
-    43: ParallelTransformerLayerPipe
-    44: ParallelTransformerLayerPipe
-stage=21 layers=2
-    45: ParallelTransformerLayerPipe
-    46: ParallelTransformerLayerPipe
-stage=22 layers=2
-    47: ParallelTransformerLayerPipe
-    48: ParallelTransformerLayerPipe
-stage=23 layers=2
-    49: ParallelTransformerLayerPipe
-    50: ParallelTransformerLayerPipe
-stage=24 layers=2
-    51: ParallelTransformerLayerPipe
-    52: ParallelTransformerLayerPipe
-stage=25 layers=2
-    53: ParallelTransformerLayerPipe
-    54: ParallelTransformerLayerPipe
-stage=26 layers=2
-    55: ParallelTransformerLayerPipe
-    56: ParallelTransformerLayerPipe
-stage=27 layers=2
-    57: ParallelTransformerLayerPipe
-    58: ParallelTransformerLayerPipe
-stage=28 layers=2
-    59: ParallelTransformerLayerPipe
-    60: ParallelTransformerLayerPipe
-stage=29 layers=2
-    61: ParallelTransformerLayerPipe
-    62: ParallelTransformerLayerPipe
-stage=30 layers=2
-    63: ParallelTransformerLayerPipe
-    64: ParallelTransformerLayerPipe
-stage=31 layers=6
-    65: ParallelTransformerLayerPipe
-    66: ParallelTransformerLayerPipe
-    67: 
-    68: MixedFusedLayerNorm
-    69: EmbeddingPipe
-    70: float16_to_fp32
-  loss: CrossEntropy
- > number of parameters on (tensor, pipeline) model parallel rank (1, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 26): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 13): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 14): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 21): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 27): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 17): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 19): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 11): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 18): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 25): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 24): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 15): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 12): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 23): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 30): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 22): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 29): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 9): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 28): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 20): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 10): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (2, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 16): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 8): 807539800
- > number of parameters on (tensor, pipeline) model parallel rank (0, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (3, 31): 978315000
- > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 978291800
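The topology and stage map above are regular enough to check by hand: 32 pipeline stages x 4 tensor-parallel ranks x 1 data-parallel replica, with the model (tensor) coordinate varying fastest. A standalone sketch, with the constants read off this log rather than taken from the training code:

# Sketch: reproduce the rank <-> ProcessCoord mapping and the
# "method type:transformer" split of 64 transformer layers over 32
# pipeline stages, as printed above. Pure Python, no dependencies.
PIPE, DATA, MODEL = 32, 1, 4          # pipeline / data / tensor degrees

def coord_to_rank(pipe, data, model):
    # model varies fastest, then data, then pipe; this matches e.g.
    # ProcessCoord(pipe=18, data=0, model=1) -> 73 in the log.
    return (pipe * DATA + data) * MODEL + model

assert coord_to_rank(18, 0, 1) == 73
assert coord_to_rank(31, 0, 3) == 127   # 128 ranks in total

# An even split gives 64 / 32 = 2 ParallelTransformerLayerPipe blocks per
# stage; stages 0 and 31 additionally carry the embedding / final-norm /
# precision-cast plumbing, which is why they list 5 and 6 layers.
print(64 // PIPE, "transformer layers per stage")   # -> 2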
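The per-rank parameter counts are internally consistent as well. A quick arithmetic check in pure Python, using only numbers that appear in the log (the hidden-size reading in the last comment is a deduction, not something the log states):

# Cross-check the parameter counts logged above. Pipeline stages 1..30
# hold plain transformer layers (4 tensor ranks each at 807,539,800);
# stages 0 and 31 also hold embedding/norm layers and report more.
middle = 120 * 807_539_800            # 30 stages x 4 tensor ranks
first  =   4 * 978_291_800            # pipeline stage 0
last   =   4 * 978_315_000            # pipeline stage 31
print(f"total: {(middle + first + last) / 1e9:.1f}B parameters")  # ~104.7B

# The ZeRO partitioner further down reports each middle rank split into
# two parameter groups, sizes (807360000, 179800), which sum back exactly:
assert 807_360_000 + 179_800 == 807_539_800

# Stage 31 exceeds stage 0 by 23,200 = 2 x 11,600 parameters, consistent
# with a final MixedFusedLayerNorm (weight + bias) at hidden size 11,600.
assert 978_315_000 - 978_291_800 == 23_200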
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 978291800
- > number of parameters on (tensor, pipeline) model parallel rank (1, 31): 978315000
-[2021-10-30 09:44:35,232] [INFO] [utils.py:806:see_memory_usage] After Building Model
-[2021-10-30 09:44:35,233] [INFO] [utils.py:807:see_memory_usage] MA 1.88 GB Max_MA 1.88 GB CA 1.91 GB Max_CA 2 GB
-[2021-10-30 09:44:35,233] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.83 GB, percent = 22.3%
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 978291800
-setting training iterations to 292968
-> learning rate decay style: cosine
-DeepSpeed is enabled.
-[2021-10-30 09:44:35,234] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.5.5+29bee73, git-hash=29bee73, git-branch=master
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-[2021-10-30 09:44:35,271] [INFO] [engine.py:207:__init__] DeepSpeed Flops Profiler Enabled: False
-[2021-10-30 09:44:35,271] [INFO] [engine.py:862:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer
-[2021-10-30 09:44:35,271] [INFO] [engine.py:868:_configure_optimizer] Using client Optimizer as basic optimizer
-[2021-10-30 09:44:35,271] [INFO] [engine.py:884:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
-[2021-10-30 09:44:35,271] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=
-[2021-10-30 09:44:35,272] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer
-[2021-10-30 09:44:35,272] [INFO] [stage2.py:111:__init__] Reduce bucket size 500000000
-[2021-10-30 09:44:35,272] [INFO] [stage2.py:112:__init__] Allgather bucket size 500000000
-[2021-10-30 09:44:35,272] [INFO] [stage2.py:113:__init__] CPU Offload: False
-[2021-10-30 09:44:35,272] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False
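The optimizer lines pin down the ZeRO setup: FusedAdam as the client optimizer, wrapped by a ZeRO stage 1 fp16 optimizer with 5e8-element reduce/allgather buckets and no CPU offload. A DeepSpeed config sketch consistent with those lines; this is an illustration, not the run's actual config file, and the batch-size field is a placeholder:

# Illustrative DeepSpeed config matching the logged settings: fp16,
# ZeRO stage 1, 500,000,000-element buckets, no CPU offload.
import json

ds_config = {
    "train_micro_batch_size_per_gpu": 1,       # placeholder, not in this log
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 1,
        "reduce_bucket_size": 500_000_000,     # "Reduce bucket size 500000000"
        "allgather_bucket_size": 500_000_000,  # "Allgather bucket size 500000000"
    },
}
print(json.dumps(ds_config, indent=2))

# Passing a config like this to deepspeed.initialize() together with a
# FusedAdam instance keeps FusedAdam as the "client Optimizer" and wraps
# it in the fp16 ZeRO stage 1 optimizer, as engine.py / stage2.py report.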
- - warnings.warn(WRONG_COMPILER_WARNING.format( -Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... -Time to load utils op: 0.6180896759033203 seconds -Loading extension module utils...Loading extension module utils...Loading extension module utils... - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - -Loading extension module utils... - -Loading extension module utils... - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - - -Loading extension module utils... -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - - -Loading extension module utils... -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... -Loading extension module utils... - -Loading extension module utils...Loading extension module utils...Loading extension module utils... -Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils... - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - - -Loading extension module utils...Loading extension module utils... - -Loading extension module utils... -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - - -Loading extension module utils... -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... -Loading extension module utils... - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... 
- - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - - -Loading extension module utils... -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils... - -Loading extension module utils...Loading extension module utils... - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... - - - -Loading extension module utils...Loading extension module utils...Loading extension module utils... - -Loading extension module utils... - -Time to load utils op: 0.6262867450714111 secondsTime to load utils op: 0.6294617652893066 secondsTime to load utils op: 0.6292548179626465 seconds - - -Time to load utils op: 0.6352829933166504 seconds -Time to load utils op: 0.6306872367858887 seconds -Time to load utils op: 0.6304712295532227 secondsTime to load utils op: 0.6276333332061768 seconds - -Time to load utils op: 0.6311967372894287 seconds -Time to load utils op: 0.6298584938049316 seconds -Time to load utils op: 0.6305150985717773 seconds -Time to load utils op: 0.6334264278411865 seconds -Time to load utils op: 0.6328375339508057 seconds -Time to load utils op: 0.6302139759063721 seconds -Time to load utils op: 0.6206560134887695 secondsTime to load utils op: 0.6207807064056396 seconds - -Time to load utils op: 0.632291316986084 seconds -Time to load utils op: 0.6348419189453125 seconds -Time to load utils op: 0.6306092739105225 seconds -Time to load utils op: 0.6213040351867676 seconds -Time to load utils op: 0.6179673671722412 seconds -Time to load utils op: 0.6315743923187256 secondsTime to load utils op: 0.6284277439117432 secondsTime to load utils op: 0.6293516159057617 seconds - - -Time to load utils op: 0.6269958019256592 seconds -Time to load utils op: 0.6274702548980713 secondsTime to load utils op: 0.6198415756225586 secondsTime to load utils op: 0.6244058609008789 seconds - - -Time to load utils op: 0.6206121444702148 seconds -Time to load utils op: 0.6299448013305664 seconds -Time to load utils op: 0.6335151195526123 secondsTime to load utils op: 0.6231951713562012 secondsTime to load utils op: 0.6233475208282471 seconds - -Time to load utils op: 0.6364481449127197 seconds -Time to load utils op: 0.6304469108581543 secondsTime to load utils op: 0.6265442371368408 seconds - - -Time to load utils op: 0.6234850883483887 seconds -Time to load utils op: 0.6334171295166016 secondsTime to load utils op: 0.6367671489715576 seconds - -Time to load utils op: 0.6294541358947754 seconds -Time to load utils op: 0.6280081272125244 seconds -Time to load utils op: 0.6292004585266113 secondsTime to load utils op: 0.6337339878082275 seconds - -Time to load utils op: 0.6334397792816162 seconds -Time to load utils op: 0.63311767578125 seconds -Time to load utils op: 0.6303987503051758 
seconds -Time to load utils op: 0.6301193237304688 seconds -Time to load utils op: 0.6329841613769531 seconds -Time to load utils op: 0.6275994777679443 seconds -Time to load utils op: 0.6356687545776367 seconds -Time to load utils op: 0.6297204494476318 secondsTime to load utils op: 0.6291308403015137 seconds - -Time to load utils op: 0.6294996738433838 seconds -Time to load utils op: 0.6296694278717041 secondsTime to load utils op: 0.6338145732879639 secondsTime to load utils op: 0.6327474117279053 secondsTime to load utils op: 0.629540205001831 seconds - - -Time to load utils op: 0.6250736713409424 seconds - -Time to load utils op: 0.6258294582366943 secondsTime to load utils op: 0.6242146492004395 seconds - -Time to load utils op: 0.6217856407165527 seconds -Time to load utils op: 0.628856897354126 seconds -Time to load utils op: 0.6319530010223389 seconds -Time to load utils op: 0.630760669708252 seconds -Time to load utils op: 0.6317846775054932 seconds -Time to load utils op: 0.5562624931335449 secondsTime to load utils op: 0.5309178829193115 secondsTime to load utils op: 0.5350449085235596 seconds - - -Time to load utils op: 0.5346157550811768 seconds -Time to load utils op: 0.6277985572814941 seconds -Time to load utils op: 0.6311564445495605 seconds -Time to load utils op: 0.6338860988616943 seconds -Time to load utils op: 0.6244940757751465 seconds -Time to load utils op: 0.638434886932373 seconds -Time to load utils op: 0.6354131698608398 secondsTime to load utils op: 0.6342980861663818 seconds - -Time to load utils op: 0.6368227005004883 seconds -Time to load utils op: 0.6305932998657227 secondsTime to load utils op: 0.6307611465454102 seconds - -Time to load utils op: 0.6318752765655518 secondsTime to load utils op: 0.6322288513183594 seconds - -Time to load utils op: 0.6304688453674316 secondsTime to load utils op: 0.6340742111206055 seconds - -Time to load utils op: 0.6373560428619385 seconds -Time to load utils op: 0.6303048133850098 seconds -Time to load utils op: 0.6300461292266846 seconds -Time to load utils op: 0.6248974800109863 secondsTime to load utils op: 0.6244962215423584 secondsTime to load utils op: 0.6251780986785889 seconds - - -Time to load utils op: 0.6328983306884766 seconds -Time to load utils op: 0.6302089691162109 seconds -Time to load utils op: 0.6291813850402832 secondsTime to load utils op: 0.6258487701416016 seconds - -Time to load utils op: 0.6304867267608643 secondsTime to load utils op: 0.6268188953399658 secondsTime to load utils op: 0.6298575401306152 seconds -Time to load utils op: 0.6310896873474121 seconds - - -Time to load utils op: 0.6290428638458252 secondsTime to load utils op: 0.6298849582672119 seconds - -Time to load utils op: 0.6324224472045898 seconds -Time to load utils op: 0.6313555240631104 seconds -Time to load utils op: 0.634448766708374 seconds -Time to load utils op: 0.6260921955108643 secondsTime to load utils op: 0.6245455741882324 seconds -Time to load utils op: 0.6283373832702637 secondsTime to load utils op: 0.6279399394989014 seconds - -Time to load utils op: 0.6270978450775146 seconds -Time to load utils op: 0.6229701042175293 seconds -Time to load utils op: 0.6254112720489502 seconds - -Time to load utils op: 0.5415327548980713 secondsTime to load utils op: 0.5320243835449219 seconds -Time to load utils op: 0.5382711887359619 seconds - -Time to load utils op: 0.6374247074127197 secondsTime to load utils op: 0.6284232139587402 seconds -Time to load utils op: 0.6285510063171387 seconds -Time to load utils op: 
-Loading extension module utils...
-Loading extension module utils...
-Loading extension module utils...
-Time to load utils op: 0.6542537212371826 seconds
-Time to load utils op: 0.6507151126861572 seconds
-Time to load utils op: 0.6554687023162842 seconds
-Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)]
-[... the same "partition count [1, 1] and sizes[(807360000, False), (179800, False)]" line is printed by every other rank in 4-123 ...]
-Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)]
-Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
-Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)]
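Editor's note: the partition-count lines are printed by the ZeRO stage-1 optimizer, one per rank, listing the sizes of its two parameter groups; with "partition count [1, 1]" each group sits in a single partition. A quick check (illustrative script; all numbers copied from the lines above) reproduces the per-rank totals that reappear as STAGE_PARAMS in the engine summary further down.

# Illustrative check of the partition sizes above (numbers copied from the log).
middle_stage = 807_360_000 + 179_800   # ranks 4-123: two param groups per rank
first_stage  = 978_112_000 + 179_800   # ranks 0-3
last_stage   = 978_112_000 + 203_000   # ranks 124-127
print(middle_stage, first_stage, last_stage)
# -> 807539800 978291800 978315000, matching STAGE_PARAMS in the
#    engine.py:151 summary later in this log.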
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
-Time to load utils op: 0.002020120620727539 seconds
-[... the same "Using .../No modifications detected/Loading extension module utils/Time to load utils op" sequence repeats for the remaining ranks; all cached re-loads finish in 0.0008-0.0023 seconds ...]
-[2021-10-30 09:44:37,849] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states
-[2021-10-30 09:44:37,850] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB
-[2021-10-30 09:44:37,850] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.85 GB, percent = 22.4%
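Editor's note: the see_memory_usage lines above (and the "After initializing ..." ones just below) report PyTorch's CUDA allocator counters: MA/Max_MA are allocated bytes, CA/Max_CA the reserved (cached) pool. The sketch below is a rough stand-in for what utils.py prints, not the exact DeepSpeed implementation.

# Rough equivalent of the see_memory_usage report in this log (a sketch).
import torch
import psutil

def see_memory_usage(message: str) -> None:
    gb = 1024 ** 3
    print(message)
    # MA/Max_MA: allocated memory; CA/Max_CA: reserved (cached) memory.
    print(f"MA {torch.cuda.memory_allocated() / gb:.2f} GB "
          f"Max_MA {torch.cuda.max_memory_allocated() / gb:.2f} GB "
          f"CA {torch.cuda.memory_reserved() / gb:.2f} GB "
          f"Max_CA {torch.cuda.max_memory_reserved() / gb:.2f} GB")
    vm = psutil.virtual_memory()
    print(f"CPU Virtual Memory: used = {vm.used / gb:.2f} GB, percent = {vm.percent}%")

see_memory_usage("Before initializing optimizer states")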
-[2021-10-30 09:44:37,902] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states
-[2021-10-30 09:44:37,903] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-30 09:44:37,903] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.85 GB, percent = 22.4%
-[2021-10-30 09:44:37,903] [INFO] [stage2.py:474:__init__] optimizer state initialized
-[2021-10-30 09:44:37,938] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer
-[2021-10-30 09:44:37,939] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB
-[2021-10-30 09:44:37,939] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 41.85 GB, percent = 22.4%
-[2021-10-30 09:44:37,939] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
-[2021-10-30 09:44:37,939] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler
-[2021-10-30 09:44:37,939] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler =
-[2021-10-30 09:44:37,939] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
-[2021-10-30 09:44:37,939] [INFO] [config.py:940:print] DeepSpeedEngine configuration:
-[2021-10-30 09:44:37,939] [INFO] [config.py:944:print] activation_checkpointing_config {
-  "partition_activations": false,
-  "contiguous_memory_optimization": false,
-  "cpu_checkpointing": false,
-  "number_checkpoints": null,
-  "synchronize_checkpoint_boundary": false,
-  "profile": false
-}
-[2021-10-30 09:44:37,939] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2021-10-30 09:44:37,939] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False
-[2021-10-30 09:44:37,939] [INFO] [config.py:944:print] amp_enabled .................. False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] amp_params ................... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] curriculum_enabled ........... True
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}}
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] dataloader_drop_last ......... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] disable_allgather ............ False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] dump_state ................... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1}
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_enabled ........... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] eigenvalue_verbose ........... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] elasticity_enabled ........... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] flops_profiler_config ........ {
-  "enabled": false,
-  "profile_step": 1,
-  "module_depth": -1,
-  "top_modules": 1,
-  "detailed": true,
-  "output_file": null
-}
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] fp16_enabled ................. True
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] global_rank .................. 0
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] gradient_clipping ............ 1.0
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] loss_scale ................... 0
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] memory_breakdown ............. False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] optimizer_name ............... None
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] optimizer_params ............. None
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] pld_enabled .................. False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] pld_params ................... False
-[2021-10-30 09:44:37,940] [INFO] [config.py:944:print] prescale_gradients ........... False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_groups .............. 1
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_offset .............. 1000
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_period .............. 1000
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_rounding ............ 0
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_start_bits .......... 16
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_target_bits ......... 8
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_training_enabled .... False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_type ................ 0
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] quantize_verbose ............. False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] scheduler_name ............... None
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] scheduler_params ............. None
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] sparse_attention ............. None
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] steps_per_print .............. 2000
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] tensorboard_enabled .......... False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] tensorboard_output_path ......
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] train_batch_size ............. 2048
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] use_quantizer_kernel ......... False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] wall_clock_breakdown ......... False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] world_size ................... 1
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] zero_allow_untested_optimizer False
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] zero_config .................. {
-  "stage": 1,
-  "contiguous_gradients": true,
-  "reduce_scatter": true,
-  "reduce_bucket_size": 5.000000e+08,
-  "allgather_partitions": true,
-  "allgather_bucket_size": 5.000000e+08,
-  "overlap_comm": false,
-  "load_from_fp32_weights": true,
-  "elastic_checkpoint": true,
-  "offload_param": null,
-  "offload_optimizer": null,
-  "sub_group_size": 1.000000e+09,
-  "prefetch_bucket_size": 5.000000e+07,
-  "param_persistence_threshold": 1.000000e+05,
-  "max_live_parameters": 1.000000e+09,
-  "max_reuse_distance": 1.000000e+09,
-  "gather_fp16_weights_on_model_save": false,
-  "ignore_unused_parameters": true,
-  "round_robin_gradients": false,
-  "legacy_stage1": false
-}
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] zero_enabled ................. True
-[2021-10-30 09:44:37,941] [INFO] [config.py:944:print] zero_optimization_stage ...... 1
-[2021-10-30 09:44:37,941] [INFO] [config.py:946:print] json = {
-  "train_micro_batch_size_per_gpu": 1,
-  "train_batch_size": 2.048000e+03,
-  "gradient_clipping": 1.0,
-  "zero_optimization": {
-    "stage": 1
-  },
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 12
-  },
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": 64,
-    "max_difficulty": 2.048000e+03,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": 3.600000e+04,
-      "difficulty_step": 8
-    }
-  },
-  "steps_per_print": 2.000000e+03,
-  "wall_clock_breakdown": false
-}
-Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...
-No modifications detected for re-loaded extension module utils, skipping build step...
-Loading extension module utils...
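Editor's note: the "json = {...}" block above is the client-supplied DeepSpeed config echoed back verbatim. Below is a minimal sketch of feeding the same settings to deepspeed.initialize; the model and learning rate are placeholders (the log only shows that the client passed a FusedAdam instance), and it assumes a DeepSpeed version whose initialize accepts a config dict.

# Sketch: the config echoed above, handed back to deepspeed.initialize.
import torch
import deepspeed
from deepspeed.ops.adam import FusedAdam

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "train_batch_size": 2048,
    "gradient_clipping": 1.0,
    "zero_optimization": {"stage": 1},
    "fp16": {"enabled": True, "loss_scale": 0, "loss_scale_window": 500,
             "hysteresis": 2, "min_loss_scale": 1, "initial_scale_power": 12},
    "curriculum_learning": {"enabled": True, "curriculum_type": "seqlen",
                            "min_difficulty": 64, "max_difficulty": 2048,
                            "schedule_type": "fixed_linear",
                            "schedule_config": {"total_curriculum_step": 36000,
                                                "difficulty_step": 8}},
    "steps_per_print": 2000,
    "wall_clock_breakdown": False,
}

model = torch.nn.Linear(8, 8)  # placeholder; the real run used a Megatron pipeline model
optimizer = FusedAdam(model.parameters(), lr=0.0)  # "Final Optimizer = FusedAdam"; lr 0.0 matches step=0 above
engine, optimizer, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=ds_config)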
-Time to load utils op: 0.0007965564727783203 seconds
-[2021-10-30 09:44:37,942] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
-[2021-10-30 09:44:38,337] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M)
- > using checkpoint value 0.0001 for learning rate
- > using checkpoint value 6e-06 for minimum learning rate
- > using checkpoint value 3750000 for warmup iterations
- > using checkpoint value 600000000 for total number of iterations
- > using checkpoint value cosine for decay style
-successfully loaded 1 ZeRO state_dicts for rank 30
-successfully loaded 1 ZeRO state_dicts for rank 51
-successfully loaded 1 ZeRO state_dicts for rank 36
-successfully loaded 1 ZeRO state_dicts for rank 120
-successfully loaded 1 ZeRO state_dicts for rank 123
-successfully loaded 1 ZeRO state_dicts for rank 31
-successfully loaded 1 ZeRO state_dicts for rank 121
-successfully loaded 1 ZeRO state_dicts for rank 39
-successfully loaded 1 ZeRO state_dicts for rank 54
-successfully loaded 1 ZeRO state_dicts for rank 44
-successfully loaded 1 ZeRO state_dicts for rank 48
-successfully loaded 1 ZeRO state_dicts for rank 98
-successfully loaded 1 ZeRO state_dicts for rank 103
-successfully loaded 1 ZeRO state_dicts for rank 49
-successfully loaded 1 ZeRO state_dicts for rank 111
-successfully loaded 1 ZeRO state_dicts for rank 72
-successfully loaded 1 ZeRO state_dicts for rank 55
-successfully loaded 1 ZeRO state_dicts for rank 38
-successfully loaded 1 ZeRO state_dicts for rank 28
-successfully loaded 1 ZeRO state_dicts for rank 53
-successfully loaded 1 ZeRO state_dicts for rank 122
-successfully loaded 1 ZeRO state_dicts for rank 113
-successfully loaded 1 ZeRO state_dicts for rank 26
-successfully loaded 1 ZeRO state_dicts for rank 92
-successfully loaded 1 ZeRO state_dicts for rank 104
-successfully loaded 1 ZeRO state_dicts for rank 67
-successfully loaded 1 ZeRO state_dicts for rank 37
-successfully loaded 1 ZeRO state_dicts for rank 79
-successfully loaded 1 ZeRO state_dicts for rank 90
-successfully loaded 1 ZeRO state_dicts for rank 66
-successfully loaded 1 ZeRO state_dicts for rank 96
-successfully loaded 1 ZeRO state_dicts for rank 78
-successfully loaded 1 ZeRO state_dicts for rank 50
-loading 1 zero partition checkpoints for rank 51
-successfully loaded 1 ZeRO state_dicts for rank 52
-successfully loaded 1 ZeRO state_dicts for rank 100
-successfully loaded 1 ZeRO state_dicts for rank 71
-successfully loaded 1 ZeRO state_dicts for rank 84
-successfully loaded 1 ZeRO state_dicts for rank 108
-successfully loaded 1 ZeRO state_dicts for rank 64
-successfully loaded 1 ZeRO state_dicts for rank 25
-successfully loaded 1 ZeRO state_dicts for rank 16
-successfully loaded 1 ZeRO state_dicts for rank 80
-successfully loaded 1 ZeRO state_dicts for rank 99
-successfully loaded 1 ZeRO state_dicts for rank 18
-successfully loaded 1 ZeRO state_dicts for rank 110
-loading 1 zero partition checkpoints for rank 36
-successfully loaded 1 ZeRO state_dicts for rank 76
-successfully loaded 1 ZeRO state_dicts for rank 88
-successfully loaded 1 ZeRO state_dicts for rank 112
-successfully loaded 1 ZeRO state_dicts for rank 70
-successfully loaded 1 ZeRO state_dicts for rank 97
-successfully loaded 1 ZeRO state_dicts for rank 114
-successfully loaded 1 ZeRO state_dicts for rank 11
-successfully loaded 1 ZeRO state_dicts for rank 102
-successfully loaded 1 ZeRO state_dicts for rank 77
-successfully loaded 1 ZeRO state_dicts for rank 45
-successfully loaded 1 ZeRO state_dicts for rank 10
-successfully loaded 1 ZeRO state_dicts for rank 19
-successfully loaded 1 ZeRO state_dicts for rank 81
-successfully loaded 1 ZeRO state_dicts for rank 57
-successfully loaded 1 ZeRO state_dicts for rank 101
-successfully loaded 1 ZeRO state_dicts for rank 59
-loading 1 zero partition checkpoints for rank 120
-successfully loaded 1 ZeRO state_dicts for rank 109
-successfully loaded 1 ZeRO state_dicts for rank 27
-successfully loaded 1 ZeRO state_dicts for rank 75
-successfully loaded 1 ZeRO state_dicts for rank 69
-loading 1 zero partition checkpoints for rank 39
-successfully loaded 1 ZeRO state_dicts for rank 40
-successfully loaded 1 ZeRO state_dicts for rank 68
-successfully loaded 1 ZeRO state_dicts for rank 65
-successfully loaded 1 ZeRO state_dicts for rank 56
-successfully loaded 1 ZeRO state_dicts for rank 17
-successfully loaded 1 ZeRO state_dicts for rank 105
-successfully loaded 1 ZeRO state_dicts for rank 93
-successfully loaded 1 ZeRO state_dicts for rank 91
-successfully loaded 1 ZeRO state_dicts for rank 6
-successfully loaded 1 ZeRO state_dicts for rank 58
-successfully loaded 1 ZeRO state_dicts for rank 83
-loading 1 zero partition checkpoints for rank 49
-successfully loaded 1 ZeRO state_dicts for rank 87
-successfully loaded 1 ZeRO state_dicts for rank 13
-loading 1 zero partition checkpoints for rank 72
-successfully loaded 1 ZeRO state_dicts for rank 42
-successfully loaded 1 ZeRO state_dicts for rank 74
-successfully loaded 1 ZeRO state_dicts for rank 73
-loading 1 zero partition checkpoints for rank 30
-successfully loaded 1 ZeRO state_dicts for rank 63
-successfully loaded 1 ZeRO state_dicts for rank 7
-successfully loaded 1 ZeRO state_dicts for rank 60
-loading 1 zero partition checkpoints for rank 121
-successfully loaded 1 ZeRO state_dicts for rank 115
-successfully loaded 1 ZeRO state_dicts for rank 15
-successfully loaded 1 ZeRO state_dicts for rank 82
-successfully loaded 1 ZeRO state_dicts for rank 24
-successfully loaded 1 ZeRO state_dicts for rank 86
-successfully loaded 1 ZeRO state_dicts for rank 62
-successfully loaded 1 ZeRO state_dicts for rank 117
-successfully loaded 1 ZeRO state_dicts for rank 95
-loading 1 zero partition checkpoints for rank 28
-successfully loaded 1 ZeRO state_dicts for rank 22
-successfully loaded 1 ZeRO state_dicts for rank 85
-successfully loaded 1 ZeRO state_dicts for rank 118
-successfully loaded 1 ZeRO state_dicts for rank 8
-loading 1 zero partition checkpoints for rank 26
-successfully loaded 1 ZeRO state_dicts for rank 20
-loading 1 zero partition checkpoints for rank 53
-loading 1 zero partition checkpoints for rank 123
-successfully loaded 1 ZeRO state_dicts for rank 89
-successfully loaded 1 ZeRO state_dicts for rank 23
-loading 1 zero partition checkpoints for rank 67
-successfully loaded 1 ZeRO state_dicts for rank 4
-loading 1 zero partition checkpoints for rank 104
-loading 1 zero partition checkpoints for rank 100
-successfully loaded 1 ZeRO state_dicts for rank 94
-successfully loaded 1 ZeRO state_dicts for rank 21
-successfully loaded 1 ZeRO state_dicts for rank 32
-loading 1 zero partition checkpoints for rank 31
-loading 1 zero partition checkpoints for rank 44
-loading 1 zero partition checkpoints for rank 78
-loading 1 zero partition checkpoints for rank 54
-loading 1 zero partition checkpoints for rank 48
-successfully loaded 1 ZeRO state_dicts for rank 107
-successfully loaded 1 ZeRO state_dicts for rank 35
-loading 1 zero partition checkpoints for rank 110
-loading 1 zero partition checkpoints for rank 52
-successfully loaded 1 ZeRO state_dicts for rank 106
-loading 1 zero partition checkpoints for rank 96
-loading 1 zero partition checkpoints for rank 98
-loading 1 zero partition checkpoints for rank 103
-loading 1 zero partition checkpoints for rank 111
-successfully loaded 1 ZeRO state_dicts for rank 61
-loading 1 zero partition checkpoints for rank 76
-loading 1 zero partition checkpoints for rank 70
-loading 1 zero partition checkpoints for rank 55
-successfully loaded 1 ZeRO state_dicts for rank 9
-loading 1 zero partition checkpoints for rank 114
-successfully loaded 1 ZeRO state_dicts for rank 29
-loading 1 zero partition checkpoints for rank 102
-loading 1 zero partition checkpoints for rank 38
-successfully loaded 1 ZeRO state_dicts for rank 47
-loading 1 zero partition checkpoints for rank 81
-loading 1 zero partition checkpoints for rank 112
-successfully loaded 1 ZeRO state_dicts for rank 124
-successfully loaded 1 ZeRO state_dicts for rank 34
-successfully loaded 1 ZeRO state_dicts for rank 5
-successfully loaded 1 ZeRO state_dicts for rank 125
-loading 1 zero partition checkpoints for rank 19
-loading 1 zero partition checkpoints for rank 109
-loading 1 zero partition checkpoints for rank 75
-successfully loaded 1 ZeRO state_dicts for rank 12
-successfully loaded 1 ZeRO state_dicts for rank 41
-loading 1 zero partition checkpoints for rank 122
-loading 1 zero partition checkpoints for rank 65
-successfully loaded 1 ZeRO state_dicts for rank 33
-loading 1 zero partition checkpoints for rank 88
-loading 1 zero partition checkpoints for rank 56
-loading 1 zero partition checkpoints for rank 93
-loading 1 zero partition checkpoints for rank 113
-loading 1 zero partition checkpoints for rank 83
-loading 1 zero partition checkpoints for rank 58
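The RANK/STAGE lines above pin down the 3D layout: ranks 0-127 spread over pipeline stages 0-31 with four ranks per stage, i.e. PP=32 x TP=4 x DP=1, which is also why engine.py reports micro_batches=2048 (train_batch_size 2048 / micro_batch_size 1 / data-parallel size 1). The logged parameter counts are self-consistent under the assumption that TOTAL_PARAMS sums STAGE_PARAMS over all 128 ranks while UNIQUE_PARAMS drops the extra copy of the tied embedding that the first and last pipeline stages both hold (the same duplication the UserWarning near the end of this excerpt complains about). A small sanity check:

    # Sanity check of the logged parameter counts (assumptions as stated above).
    first_stage  = 978_291_800   # STAGE=0, 5 layers, per tensor-parallel shard
    last_stage   = 978_315_000   # STAGE=31, 6 layers, per tensor-parallel shard
    middle_stage = 807_539_800   # STAGE=1..30, 2 layers each, per shard
    tp = 4                       # ranks per pipeline stage

    total = tp * (first_stage + last_stage + 30 * middle_stage)
    assert total == 104_731_203_200              # TOTAL_PARAMS in the log

    duplicated = total - 104_048_195_200         # minus UNIQUE_PARAMS
    assert duplicated == 683_008_000             # plausibly one extra full
    assert duplicated == tp * (first_stage - middle_stage)  # embedding copy,
                                                 # 4 shards of ~170.752M each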
-loading 1 zero partition checkpoints for rank 37
-loading 1 zero partition checkpoints for rank 92
-loading 1 zero partition checkpoints for rank 16
-loading 1 zero partition checkpoints for rank 84
-loading 1 zero partition checkpoints for rank 79
-loading 1 zero partition checkpoints for rank 97
-loading 1 zero partition checkpoints for rank 50
-loading 1 zero partition checkpoints for rank 66
-loading 1 zero partition checkpoints for rank 90
-loading 1 zero partition checkpoints for rank 6
-loading 1 zero partition checkpoints for rank 60
-loading 1 zero partition checkpoints for rank 108
-loading 1 zero partition checkpoints for rank 71
-successfully loaded 1 ZeRO state_dicts for rank 43
-loading 1 zero partition checkpoints for rank 63
-loading 1 zero partition checkpoints for rank 64
-loading 1 zero partition checkpoints for rank 91
-loading 1 zero partition checkpoints for rank 25
-loading 1 zero partition checkpoints for rank 68
-loading 1 zero partition checkpoints for rank 80
-loading 1 zero partition checkpoints for rank 87
-loading 1 zero partition checkpoints for rank 99
-loading 1 zero partition checkpoints for rank 18
-loading 1 zero partition checkpoints for rank 7
-loading 1 zero partition checkpoints for rank 24
-loading 1 zero partition checkpoints for rank 20
-loading 1 zero partition checkpoints for rank 45
-loading 1 zero partition checkpoints for rank 8
-loading 1 zero partition checkpoints for rank 23
-successfully loaded 1 ZeRO state_dicts for rank 116
-loading 1 zero partition checkpoints for rank 77
-loading 1 zero partition checkpoints for rank 11
-loading 1 zero partition checkpoints for rank 10
-loading 1 zero partition checkpoints for rank 57
-loading 1 zero partition checkpoints for rank 101
-loading 1 zero partition checkpoints for rank 32
-loading 1 zero partition checkpoints for rank 94
-loading 1 zero partition checkpoints for rank 27
-loading 1 zero partition checkpoints for rank 59
-loading 1 zero partition checkpoints for rank 40
-loading 1 zero partition checkpoints for rank 69
-successfully loaded 1 ZeRO state_dicts for rank 127
-loading 1 zero partition checkpoints for rank 105
-loading 1 zero partition checkpoints for rank 107
-loading 1 zero partition checkpoints for rank 17
-successfully loaded 1 ZeRO state_dicts for rank 14
-loading 1 zero partition checkpoints for rank 42
-loading 1 zero partition checkpoints for rank 13
-successfully loaded 1 ZeRO state_dicts for rank 46
-loading 1 zero partition checkpoints for rank 74
-loading 1 zero partition checkpoints for rank 73
-loading 1 zero partition checkpoints for rank 29
-successfully loaded 1 ZeRO state_dicts for rank 126
-loading 1 zero partition checkpoints for rank 115
-loading 1 zero partition checkpoints for rank 47
-loading 1 zero partition checkpoints for rank 82
-loading 1 zero partition checkpoints for rank 15
-loading 1 zero partition checkpoints for rank 9
-loading 1 zero partition checkpoints for rank 117
-loading 1 zero partition checkpoints for rank 62
-loading 1 zero partition checkpoints for rank 95
-loading 1 zero partition checkpoints for rank 118
-loading 1 zero partition checkpoints for rank 22
-loading 1 zero partition checkpoints for rank 86
-loading 1 zero partition checkpoints for rank 85
-loading 1 zero partition checkpoints for rank 41
-successfully loaded 1 ZeRO state_dicts for rank 3
-loading 1 zero partition checkpoints for rank 34
-loading 1 zero partition checkpoints for rank 12
-loading 1 zero partition checkpoints for rank 4
-loading 1 zero partition checkpoints for rank 89
-loading 1 zero partition checkpoints for rank 21
-successfully loaded 1 ZeRO state_dicts for rank 0
-successfully loaded 1 ZeRO state_dicts for rank 1
-successfully loaded 1 ZeRO state_dicts for rank 119
-loading 1 zero partition checkpoints for rank 106
-loading 1 zero partition checkpoints for rank 35
-loading 1 zero partition checkpoints for rank 61
-loading 1 zero partition checkpoints for rank 43
-loading 1 zero partition checkpoints for rank 124
-loading 1 zero partition checkpoints for rank 5
-successfully loaded 1 ZeRO state_dicts for rank 2
-loading 1 zero partition checkpoints for rank 116
-loading 1 zero partition checkpoints for rank 33
-loading 1 zero partition checkpoints for rank 14
-loading 1 zero partition checkpoints for rank 46
-loading 1 zero partition checkpoints for rank 125
-loading 1 zero partition checkpoints for rank 127
-loading 1 zero partition checkpoints for rank 119
-loading 1 zero partition checkpoints for rank 3
-loading 1 zero partition checkpoints for rank 0
-loading 1 zero partition checkpoints for rank 126
- checkpoint version 3.0
-loading 1 zero partition checkpoints for rank 1
-loading 1 zero partition checkpoints for rank 2
- successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 3186
-time (ms) | load-checkpoint: 15312.17
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings
- warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings")
-estimated model parameters: 103.3650944
-estimated model parameters: 125.2213504
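The "> using checkpoint value" lines above restore the schedule state this run resumed with: peak lr 1e-4, minimum lr 6e-6, 3750000 warmup iterations, 600000000 total iterations, cosine decay. A sketch of the curve those values describe, in the usual Megatron linear-warmup-plus-cosine form (the clamping details in the real scheduler may differ):

    import math

    LR, MIN_LR = 1e-4, 6e-6
    WARMUP, TOTAL = 3_750_000, 600_000_000   # in the log's "iterations" unit

    def lr_at(it):
        if it < WARMUP:                      # linear warmup from 0 to LR
            return LR * it / WARMUP
        frac = min((it - WARMUP) / (TOTAL - WARMUP), 1.0)
        return MIN_LR + 0.5 * (LR - MIN_LR) * (1.0 + math.cos(math.pi * frac))

    assert abs(lr_at(WARMUP) - LR) < 1e-12     # peak at the end of warmup
    assert abs(lr_at(TOTAL) - MIN_LR) < 1e-12  # decayed to the floor by the end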
be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies 
of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 
103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 125.2213504estimated model parameters: 125.2213504 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and 
last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters without embeddings: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 
-/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings - warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944estimated model parameters: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated 
model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944 -estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 - -estimated model parameters without 
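The warning and the two different totals go together: with PP > 1 the tied input and output embeddings are held by both the first and the last pipeline stage, so a naive sum of per-stage counts includes the embedding matrix once per holding stage, which is why a separate "without embeddings" estimate is printed. A back-of-the-envelope sketch of the over-count, with illustrative sizes only (this is not the Megatron-DeepSpeed counting code, and `vocab`/`hidden` are not read from this run's config):

```python
def embedding_params(vocab_size: int, hidden_size: int) -> int:
    # one copy of the token-embedding matrix
    return vocab_size * hidden_size

def naive_pp_sum(params_without_embeddings: int, vocab_size: int,
                 hidden_size: int, stages_holding_embeddings: int = 2) -> int:
    # With PP > 1 the tied embedding lives on the first AND last stage,
    # so summing per-stage counts includes it once per holding stage.
    return (params_without_embeddings
            + stages_holding_embeddings * embedding_params(vocab_size, hidden_size))

# hypothetical sizes, NOT taken from this run:
vocab, hidden = 50_432, 16_384
without = 103_365_094_400  # the "without embeddings" estimate above, in parameters

total_naive = naive_pp_sum(without, vocab, hidden)
total_true = without + embedding_params(vocab, hidden)  # embedding counted once
print(total_naive - total_true)  # the double-counted copy: vocab * hidden
```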
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-30 09:44:53
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
-    train:      600000000
-    validation: 20008960
-    test:       10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
-    reading sizes...
-    reading pointers...
-    reading document index...
-    creating numpy buffer of mmap...
-    creating memory view of numpy buffer...
- > finished creating indexed dataset in 4.522303 seconds
-    number of documents: 304230423
- > dataset split:
-    train:
-     document indices in [0, 288714672) total of 288714672 documents
-    validation:
-     document indices in [288714672, 303926193) total of 15211521 documents
-    test:
-     document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.496 seconds
-    total number of samples: 657686117
-    total number of epochs: 5
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_20008960ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.408 seconds
-    total number of samples: 20781483
-    total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy
-    loaded indexed file in 0.067 seconds
-    total number of samples: 137384
-    total number of epochs: 1
-> finished creating GPT datasets ...
-[after dataloaders are built] datetime: 2021-10-30 09:45:03
-done with setup ...
-training ...
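The sample and epoch totals above are consistent with the index builder taking enough whole epochs of a split to cover the requested number of samples. A quick sanity check of that reading from the logged numbers (my interpretation of the output, not the dataset-builder code itself):

```python
import math

def epochs_needed(requested_samples: int, samples_per_epoch: int) -> int:
    # enough whole passes over the split to reach the requested sample count
    return math.ceil(requested_samples / samples_per_epoch)

# from the log: 657,686,117 total train samples over 5 epochs
# => roughly 131.5M samples per epoch of the train split
samples_per_epoch = 657_686_117 // 5
print(epochs_needed(600_000_000, samples_per_epoch))  # -> 5, matching the log
```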
-time (ms) | model-and-optimizer-setup: 20933.51 | train/valid/test-data-iterators-setup: 9543.04
-Number of parameters: 103.3650944 billion
-Number of parameters: 125.2213504 billion
-Number of parameters: 125.22432 billion
-Number of parameters without embeddings: 103.3650944 billion
-Number of parameters without embeddings: 103.368064 billion
[per-rank repeats and mid-line interleavings of the parameter counts above are elided]
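Every rank printed its own copy of these lines, which is why the raw log repeats them dozens of times, sometimes fused mid-line when two ranks write at once. The standard remedy is a rank-guarded print, sketched here against the generic torch.distributed API (a common pattern, not a patch to this codebase):

```python
import torch.distributed as dist

def print_rank_0(message: str) -> None:
    # print only from global rank 0 so multi-process logs stay readable
    if dist.is_initialized():
        if dist.get_rank() == 0:
            print(message, flush=True)
    else:
        # single-process runs have no process group; just print
        print(message, flush=True)

print_rank_0("Number of parameters: 103.3650944 billion")
```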
-[before the start of training step] datetime: 2021-10-30 09:45:04
-[2021-10-30 09:45:04,027] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
-[2021-10-30 09:45:04,028] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-10-30 09:45:04,028] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers
-[2021-10-30 09:45:04,028] [INFO] [checkpointing.py:554:forward] ----Synchronization False
-[2021-10-30 09:45:04,028] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
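The Activation Checkpointing Information block reports the effective settings: no activation partitioning, no CPU checkpointing, no contiguous-memory checkpointing, no extra synchronization or profiling, across 64 layers. If I map those flags back onto DeepSpeed's checkpointing configuration, they correspond to a call along these lines (a sketch; the keyword names reflect my understanding of deepspeed.checkpointing.configure and may not match the exact call site in this run):

```python
import deepspeed

# mirrors the settings reported in the log block above
deepspeed.checkpointing.configure(
    mpu_=None,                      # the model-parallel state object in a real run
    partition_activations=False,    # "Partition Activations False"
    contiguous_checkpointing=False, # "contiguous Memory Checkpointing False"
    num_checkpoints=64,             # "with 64 total layers"
    checkpoint_in_cpu=False,        # "CPU CHECKPOINTING False"
    synchronize=False,              # "Synchronization False"
    profile=False,                  # "Profiling time in checkpointing False"
)
```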
13203.33349609375 | max allocated: 20666.88232421875 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 11] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 7] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 19] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 15] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 2] (after 3187 iterations) memory (MB) | allocated: 13202.56640625 | max allocated: 20666.115234375 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 10] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 6] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 126] (after 3187 iterations) memory (MB) | allocated: 13114.45849609375 | max allocated: 20578.064453125 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 27] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 - iteration 3187/ 292968 | consumed samples: 6526976 | consumed tokens: 965033984 | elapsed time per iteration (ms): 168232.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536748E+00 | loss scale: 131072.0 | grad norm: 48451.892 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -[Rank 39] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 35] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 31] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 43] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 47] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 23] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 51] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 127] (after 3187 iterations) memory (MB) | allocated: 13114.0087890625 | max allocated: 20577.61474609375 | reserved: 24404.0 | max reserved: 24404.0 -time (ms) -[Rank 59] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 63] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 55] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | 
reserved: 20072.0 | max reserved: 20072.0 -[Rank 75] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 67] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 79] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 71] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 83] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 87] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 95] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 91] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 103] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 99] (after 3187 iterations) memory (MB) | allocated: 10789.5224609375 | max allocated: 16949.7041015625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 111] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 107] (after 3187 iterations) memory (MB) | allocated: 10789.5224609375 | max allocated: 16949.7041015625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 18] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.634765625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 115] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 119] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 123] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 14] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 5] (after 3187 iterations) memory (MB) | allocated: 10789.228515625 | max allocated: 16949.41015625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 1] (after 3187 iterations) memory (MB) | allocated: 13203.38427734375 | max allocated: 20666.93310546875 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 125] (after 3187 iterations) memory (MB) | allocated: 13114.45849609375 | max allocated: 20578.064453125 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 0] (after 3187 iterations) memory (MB) | allocated: 13203.9521484375 | max allocated: 20667.5009765625 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 13] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 9] (after 3187 iterations) memory (MB) | allocated: 
10789.228515625 | max allocated: 16949.41015625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 4] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 124] (after 3187 iterations) memory (MB) | allocated: 13114.0087890625 | max allocated: 20577.61474609375 | reserved: 24404.0 | max reserved: 24404.0 -[Rank 12] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 8] (after 3187 iterations) memory (MB) | allocated: 10789.18994140625 | max allocated: 16949.37158203125 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 16] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 17] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 24] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 28] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 36] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 29] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 33] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 44] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 37] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 40] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 25] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 45] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 41] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 48] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 52] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 53] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 21] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 49] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 56] 
(after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 57] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 64] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 65] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 61] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 60] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 76] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 32] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 69] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 20] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 77] (after 3187 iterations) memory (MB) | allocated: 10789.5224609375 | max allocated: 16949.7041015625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 84] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 73] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 80] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 81] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 96] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 85] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 92] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 89] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 93] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 68] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 97] (after 3187 iterations) memory (MB) | allocated: 10789.5224609375 | max allocated: 16949.7041015625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 108] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | 
reserved: 20072.0 | max reserved: 20072.0 -[Rank 104] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 101] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 100] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 26] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 105] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 72] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 30] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 109] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 34] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 116] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 117] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 113] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 38] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 120] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 22] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 42] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 88] (after 3187 iterations) memory (MB) | allocated: 10789.48583984375 | max allocated: 16949.66748046875 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 121] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 46] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 54] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 50] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 58] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 66] (after 3187 iterations) memory (MB) | 
allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 62] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 112] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 70] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 74] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 78] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 86] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 82] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 90] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 98] (after 3187 iterations) memory (MB) | allocated: 10789.74267578125 | max allocated: 16949.92431640625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 94] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 106] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 102] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 110] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 114] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 118] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 -[Rank 122] (after 3187 iterations) memory (MB) | allocated: 10788.97314453125 | max allocated: 16949.15478515625 | reserved: 20072.0 | max reserved: 20072.0 - iteration 3188/ 292968 | consumed samples: 6529024 | consumed tokens: 965509120 | elapsed time per iteration (ms): 113516.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536709E+00 | loss scale: 131072.0 | grad norm: 85199.359 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3189/ 292968 | consumed samples: 6531072 | consumed tokens: 965984256 | elapsed time per iteration (ms): 110920.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555685E+00 | loss scale: 131072.0 | grad norm: 89947.231 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3190/ 292968 | consumed samples: 6533120 | consumed tokens: 966459392 | elapsed time per iteration (ms): 108604.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 
3.597191E+00 | loss scale: 131072.0 | grad norm: 65128.796 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3191/ 292968 | consumed samples: 6535168 | consumed tokens: 966934528 | elapsed time per iteration (ms): 110127.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.573323E+00 | loss scale: 131072.0 | grad norm: 66733.021 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3192/ 292968 | consumed samples: 6537216 | consumed tokens: 967409664 | elapsed time per iteration (ms): 110870.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.555734E+00 | loss scale: 131072.0 | grad norm: 68561.231 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3193/ 292968 | consumed samples: 6539264 | consumed tokens: 967884800 | elapsed time per iteration (ms): 109027.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.552716E+00 | loss scale: 131072.0 | grad norm: 55978.416 | num zeros: 0.0 | curriculum seqlen: 232 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3194/ 292968 | consumed samples: 6541312 | consumed tokens: 968376320 | elapsed time per iteration (ms): 111345.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527816E+00 | loss scale: 131072.0 | grad norm: 56456.172 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3195/ 292968 | consumed samples: 6543360 | consumed tokens: 968867840 | elapsed time per iteration (ms): 112383.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536128E+00 | loss scale: 131072.0 | grad norm: 40825.930 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3196/ 292968 | consumed samples: 6545408 | consumed tokens: 969359360 | elapsed time per iteration (ms): 111701.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.549054E+00 | loss scale: 131072.0 | grad norm: 33016.961 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3197/ 292968 | consumed samples: 6547456 | consumed tokens: 969850880 | elapsed time per iteration (ms): 111124.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536800E+00 | loss scale: 131072.0 | grad norm: 37411.700 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3198/ 292968 | consumed samples: 6549504 | consumed tokens: 970342400 | elapsed time per iteration (ms): 112697.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.536644E+00 | loss scale: 131072.0 | grad norm: 25965.144 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3199/ 292968 | consumed samples: 6551552 | consumed tokens: 970833920 | elapsed time per iteration (ms): 112599.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.548866E+00 | loss scale: 131072.0 | grad norm: 37333.916 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3200/ 292968 | consumed samples: 6553600 | consumed 
tokens: 971325440 | elapsed time per iteration (ms): 109898.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.529469E+00 | loss scale: 131072.0 | grad norm: 30425.769 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3201/ 292968 | consumed samples: 6555648 | consumed tokens: 971816960 | elapsed time per iteration (ms): 111025.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.525378E+00 | loss scale: 131072.0 | grad norm: 28364.841 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3202/ 292968 | consumed samples: 6557696 | consumed tokens: 972308480 | elapsed time per iteration (ms): 113004.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.517115E+00 | loss scale: 131072.0 | grad norm: 31346.971 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3203/ 292968 | consumed samples: 6559744 | consumed tokens: 972800000 | elapsed time per iteration (ms): 111773.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.514864E+00 | loss scale: 131072.0 | grad norm: 30253.961 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3204/ 292968 | consumed samples: 6561792 | consumed tokens: 973291520 | elapsed time per iteration (ms): 111844.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.508639E+00 | loss scale: 131072.0 | grad norm: 26693.316 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3205/ 292968 | consumed samples: 6563840 | consumed tokens: 973783040 | elapsed time per iteration (ms): 111641.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.505033E+00 | loss scale: 131072.0 | grad norm: 35395.987 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3206/ 292968 | consumed samples: 6565888 | consumed tokens: 974274560 | elapsed time per iteration (ms): 111253.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.513017E+00 | loss scale: 131072.0 | grad norm: 36651.727 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3207/ 292968 | consumed samples: 6567936 | consumed tokens: 974766080 | elapsed time per iteration (ms): 112641.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.488414E+00 | loss scale: 131072.0 | grad norm: 29253.627 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3208/ 292968 | consumed samples: 6569984 | consumed tokens: 975257600 | elapsed time per iteration (ms): 112517.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.517234E+00 | loss scale: 131072.0 | grad norm: 36065.346 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3209/ 292968 | consumed samples: 6572032 | consumed tokens: 975749120 | elapsed time per iteration (ms): 110959.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.495211E+00 | loss scale: 131072.0 | grad norm: 35210.357 | num zeros: 0.0 | curriculum seqlen: 240 | number of 
skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3210/ 292968 | consumed samples: 6574080 | consumed tokens: 976240640 | elapsed time per iteration (ms): 112092.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.504295E+00 | loss scale: 131072.0 | grad norm: 31153.585 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3211/ 292968 | consumed samples: 6576128 | consumed tokens: 976732160 | elapsed time per iteration (ms): 111543.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.484155E+00 | loss scale: 131072.0 | grad norm: 31093.055 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3212/ 292968 | consumed samples: 6578176 | consumed tokens: 977223680 | elapsed time per iteration (ms): 112375.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.517068E+00 | loss scale: 131072.0 | grad norm: 36879.384 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3213/ 292968 | consumed samples: 6580224 | consumed tokens: 977715200 | elapsed time per iteration (ms): 120962.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.503039E+00 | loss scale: 131072.0 | grad norm: 35040.139 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3214/ 292968 | consumed samples: 6582272 | consumed tokens: 978206720 | elapsed time per iteration (ms): 113382.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.466095E+00 | loss scale: 131072.0 | grad norm: 33437.550 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3215/ 292968 | consumed samples: 6584320 | consumed tokens: 978698240 | elapsed time per iteration (ms): 112542.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.495120E+00 | loss scale: 131072.0 | grad norm: 37763.151 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3216/ 292968 | consumed samples: 6586368 | consumed tokens: 979189760 | elapsed time per iteration (ms): 116877.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.512713E+00 | loss scale: 131072.0 | grad norm: 33507.271 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3217/ 292968 | consumed samples: 6588416 | consumed tokens: 979681280 | elapsed time per iteration (ms): 117907.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.482557E+00 | loss scale: 131072.0 | grad norm: 39554.999 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3218/ 292968 | consumed samples: 6590464 | consumed tokens: 980172800 | elapsed time per iteration (ms): 116288.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.497734E+00 | loss scale: 131072.0 | grad norm: 39573.342 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3219/ 292968 | consumed samples: 6592512 | consumed tokens: 980664320 | elapsed time per iteration (ms): 111430.2 | learning rate: 1.000E-04 | global batch size: 
2048 | lm loss: 3.489817E+00 | loss scale: 131072.0 | grad norm: 35532.232 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3220/ 292968 | consumed samples: 6594560 | consumed tokens: 981155840 | elapsed time per iteration (ms): 112903.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.512833E+00 | loss scale: 131072.0 | grad norm: 31799.118 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3221/ 292968 | consumed samples: 6596608 | consumed tokens: 981647360 | elapsed time per iteration (ms): 114703.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.487310E+00 | loss scale: 131072.0 | grad norm: 34516.900 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3222/ 292968 | consumed samples: 6598656 | consumed tokens: 982138880 | elapsed time per iteration (ms): 111287.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.526170E+00 | loss scale: 131072.0 | grad norm: 34583.913 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3223/ 292968 | consumed samples: 6600704 | consumed tokens: 982630400 | elapsed time per iteration (ms): 111750.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.512444E+00 | loss scale: 131072.0 | grad norm: 46389.879 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3224/ 292968 | consumed samples: 6602752 | consumed tokens: 983121920 | elapsed time per iteration (ms): 110586.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.528018E+00 | loss scale: 131072.0 | grad norm: 56823.203 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3225/ 292968 | consumed samples: 6604800 | consumed tokens: 983613440 | elapsed time per iteration (ms): 111959.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.523005E+00 | loss scale: 131072.0 | grad norm: 37028.503 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3226/ 292968 | consumed samples: 6606848 | consumed tokens: 984104960 | elapsed time per iteration (ms): 111724.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.504554E+00 | loss scale: 131072.0 | grad norm: 48541.196 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3227/ 292968 | consumed samples: 6608896 | consumed tokens: 984596480 | elapsed time per iteration (ms): 111009.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.515906E+00 | loss scale: 131072.0 | grad norm: 49264.491 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3228/ 292968 | consumed samples: 6610944 | consumed tokens: 985088000 | elapsed time per iteration (ms): 112649.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.516393E+00 | loss scale: 131072.0 | grad norm: 35496.510 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3229/ 292968 | consumed samples: 
6612992 | consumed tokens: 985579520 | elapsed time per iteration (ms): 111470.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.514546E+00 | loss scale: 131072.0 | grad norm: 35849.260 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3230/ 292968 | consumed samples: 6615040 | consumed tokens: 986071040 | elapsed time per iteration (ms): 110878.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.514284E+00 | loss scale: 131072.0 | grad norm: 41469.878 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3231/ 292968 | consumed samples: 6617088 | consumed tokens: 986562560 | elapsed time per iteration (ms): 109910.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.490617E+00 | loss scale: 131072.0 | grad norm: 38896.131 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3232/ 292968 | consumed samples: 6619136 | consumed tokens: 987054080 | elapsed time per iteration (ms): 111673.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.512830E+00 | loss scale: 131072.0 | grad norm: 33070.186 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3233/ 292968 | consumed samples: 6621184 | consumed tokens: 987545600 | elapsed time per iteration (ms): 111505.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.527050E+00 | loss scale: 131072.0 | grad norm: 37932.338 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3234/ 292968 | consumed samples: 6623232 | consumed tokens: 988037120 | elapsed time per iteration (ms): 110767.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.520366E+00 | loss scale: 131072.0 | grad norm: 36172.854 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3235/ 292968 | consumed samples: 6625280 | consumed tokens: 988528640 | elapsed time per iteration (ms): 110564.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.498188E+00 | loss scale: 131072.0 | grad norm: 37528.273 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3236/ 292968 | consumed samples: 6627328 | consumed tokens: 989020160 | elapsed time per iteration (ms): 112747.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.517138E+00 | loss scale: 131072.0 | grad norm: 43856.052 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3237/ 292968 | consumed samples: 6629376 | consumed tokens: 989511680 | elapsed time per iteration (ms): 112824.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.533917E+00 | loss scale: 131072.0 | grad norm: 36516.059 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3238/ 292968 | consumed samples: 6631424 | consumed tokens: 990003200 | elapsed time per iteration (ms): 111098.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.498120E+00 | loss scale: 131072.0 | grad norm: 41361.365 | num zeros: 0.0 | curriculum seqlen: 
240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3239/ 292968 | consumed samples: 6633472 | consumed tokens: 990494720 | elapsed time per iteration (ms): 112615.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.509820E+00 | loss scale: 131072.0 | grad norm: 62598.160 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3240/ 292968 | consumed samples: 6635520 | consumed tokens: 990986240 | elapsed time per iteration (ms): 110191.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.483434E+00 | loss scale: 131072.0 | grad norm: 55741.853 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3241/ 292968 | consumed samples: 6637568 | consumed tokens: 991477760 | elapsed time per iteration (ms): 112055.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.534761E+00 | loss scale: 131072.0 | grad norm: 40162.102 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3242/ 292968 | consumed samples: 6639616 | consumed tokens: 991969280 | elapsed time per iteration (ms): 110944.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.518108E+00 | loss scale: 131072.0 | grad norm: 43256.029 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3243/ 292968 | consumed samples: 6641664 | consumed tokens: 992460800 | elapsed time per iteration (ms): 110612.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.505326E+00 | loss scale: 131072.0 | grad norm: 34049.259 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3244/ 292968 | consumed samples: 6643712 | consumed tokens: 992952320 | elapsed time per iteration (ms): 110996.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.469863E+00 | loss scale: 131072.0 | grad norm: 39566.431 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3245/ 292968 | consumed samples: 6645760 | consumed tokens: 993443840 | elapsed time per iteration (ms): 110711.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.496788E+00 | loss scale: 131072.0 | grad norm: 41042.556 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3246/ 292968 | consumed samples: 6647808 | consumed tokens: 993935360 | elapsed time per iteration (ms): 112623.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.505213E+00 | loss scale: 131072.0 | grad norm: 41738.715 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3247/ 292968 | consumed samples: 6649856 | consumed tokens: 994426880 | elapsed time per iteration (ms): 111326.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.486457E+00 | loss scale: 131072.0 | grad norm: 39878.696 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3248/ 292968 | consumed samples: 6651904 | consumed tokens: 994918400 | elapsed time per iteration (ms): 112574.3 | learning rate: 1.000E-04 | global 
batch size: 2048 | lm loss: 3.469687E+00 | loss scale: 131072.0 | grad norm: 33922.146 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3249/ 292968 | consumed samples: 6653952 | consumed tokens: 995409920 | elapsed time per iteration (ms): 110648.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.471512E+00 | loss scale: 131072.0 | grad norm: 33383.898 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3250/ 292968 | consumed samples: 6656000 | consumed tokens: 995901440 | elapsed time per iteration (ms): 112336.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.539900E+00 | loss scale: 131072.0 | grad norm: 35349.622 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3251/ 292968 | consumed samples: 6658048 | consumed tokens: 996392960 | elapsed time per iteration (ms): 110933.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.479738E+00 | loss scale: 131072.0 | grad norm: 38928.251 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3252/ 292968 | consumed samples: 6660096 | consumed tokens: 996884480 | elapsed time per iteration (ms): 110855.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.492504E+00 | loss scale: 131072.0 | grad norm: 35033.057 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3253/ 292968 | consumed samples: 6662144 | consumed tokens: 997376000 | elapsed time per iteration (ms): 112026.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.490637E+00 | loss scale: 131072.0 | grad norm: 39358.440 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3254/ 292968 | consumed samples: 6664192 | consumed tokens: 997867520 | elapsed time per iteration (ms): 110400.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.475244E+00 | loss scale: 131072.0 | grad norm: 38281.132 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3255/ 292968 | consumed samples: 6666240 | consumed tokens: 998359040 | elapsed time per iteration (ms): 111582.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.512143E+00 | loss scale: 131072.0 | grad norm: 41348.295 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3256/ 292968 | consumed samples: 6668288 | consumed tokens: 998850560 | elapsed time per iteration (ms): 114295.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.511817E+00 | loss scale: 131072.0 | grad norm: 51514.910 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3257/ 292968 | consumed samples: 6670336 | consumed tokens: 999342080 | elapsed time per iteration (ms): 114201.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.490412E+00 | loss scale: 131072.0 | grad norm: 50568.937 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3258/ 292968 | consumed 
samples: 6672384 | consumed tokens: 999833600 | elapsed time per iteration (ms): 112390.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.471197E+00 | loss scale: 131072.0 | grad norm: 48187.013 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3259/ 292968 | consumed samples: 6674432 | consumed tokens: 1000325120 | elapsed time per iteration (ms): 112710.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.479379E+00 | loss scale: 131072.0 | grad norm: 40458.309 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3260/ 292968 | consumed samples: 6676480 | consumed tokens: 1000816640 | elapsed time per iteration (ms): 113180.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.473989E+00 | loss scale: 131072.0 | grad norm: 33616.338 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3261/ 292968 | consumed samples: 6678528 | consumed tokens: 1001308160 | elapsed time per iteration (ms): 111586.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.498321E+00 | loss scale: 131072.0 | grad norm: 46783.611 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3262/ 292968 | consumed samples: 6680576 | consumed tokens: 1001799680 | elapsed time per iteration (ms): 111004.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.508714E+00 | loss scale: 131072.0 | grad norm: 45630.732 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3263/ 292968 | consumed samples: 6682624 | consumed tokens: 1002291200 | elapsed time per iteration (ms): 112232.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.504247E+00 | loss scale: 131072.0 | grad norm: 56956.550 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3264/ 292968 | consumed samples: 6684672 | consumed tokens: 1002782720 | elapsed time per iteration (ms): 114908.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.483163E+00 | loss scale: 131072.0 | grad norm: 60085.040 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3265/ 292968 | consumed samples: 6686720 | consumed tokens: 1003274240 | elapsed time per iteration (ms): 113398.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.485216E+00 | loss scale: 131072.0 | grad norm: 50799.684 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3266/ 292968 | consumed samples: 6688768 | consumed tokens: 1003765760 | elapsed time per iteration (ms): 111087.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.498115E+00 | loss scale: 131072.0 | grad norm: 40484.037 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3267/ 292968 | consumed samples: 6690816 | consumed tokens: 1004257280 | elapsed time per iteration (ms): 112981.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.485603E+00 | loss scale: 131072.0 | grad norm: 35172.870 | num zeros: 0.0 | 
curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3268/ 292968 | consumed samples: 6692864 | consumed tokens: 1004748800 | elapsed time per iteration (ms): 112046.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.503227E+00 | loss scale: 131072.0 | grad norm: 36791.981 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3269/ 292968 | consumed samples: 6694912 | consumed tokens: 1005240320 | elapsed time per iteration (ms): 110197.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.492297E+00 | loss scale: 131072.0 | grad norm: 39721.467 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3270/ 292968 | consumed samples: 6696960 | consumed tokens: 1005731840 | elapsed time per iteration (ms): 110041.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.465833E+00 | loss scale: 131072.0 | grad norm: 41592.190 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3271/ 292968 | consumed samples: 6699008 | consumed tokens: 1006223360 | elapsed time per iteration (ms): 110297.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.520051E+00 | loss scale: 131072.0 | grad norm: 38770.837 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3272/ 292968 | consumed samples: 6701056 | consumed tokens: 1006714880 | elapsed time per iteration (ms): 113682.9 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.532229E+00 | loss scale: 131072.0 | grad norm: 46863.674 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3273/ 292968 | consumed samples: 6703104 | consumed tokens: 1007206400 | elapsed time per iteration (ms): 115764.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.487801E+00 | loss scale: 131072.0 | grad norm: 47275.617 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3274/ 292968 | consumed samples: 6705152 | consumed tokens: 1007697920 | elapsed time per iteration (ms): 113611.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.499582E+00 | loss scale: 131072.0 | grad norm: 43028.621 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3275/ 292968 | consumed samples: 6707200 | consumed tokens: 1008189440 | elapsed time per iteration (ms): 111135.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.499293E+00 | loss scale: 131072.0 | grad norm: 43217.821 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3276/ 292968 | consumed samples: 6709248 | consumed tokens: 1008680960 | elapsed time per iteration (ms): 111398.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.495284E+00 | loss scale: 131072.0 | grad norm: 35376.715 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3277/ 292968 | consumed samples: 6711296 | consumed tokens: 1009172480 | elapsed time per iteration (ms): 112414.1 | 
learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.483550E+00 | loss scale: 131072.0 | grad norm: 34250.645 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3278/ 292968 | consumed samples: 6713344 | consumed tokens: 1009664000 | elapsed time per iteration (ms): 111344.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.486138E+00 | loss scale: 131072.0 | grad norm: 30434.955 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3279/ 292968 | consumed samples: 6715392 | consumed tokens: 1010155520 | elapsed time per iteration (ms): 112060.0 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.498874E+00 | loss scale: 131072.0 | grad norm: 29348.389 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3280/ 292968 | consumed samples: 6717440 | consumed tokens: 1010647040 | elapsed time per iteration (ms): 112820.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.497196E+00 | loss scale: 131072.0 | grad norm: 29673.133 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3281/ 292968 | consumed samples: 6719488 | consumed tokens: 1011138560 | elapsed time per iteration (ms): 111234.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.499080E+00 | loss scale: 131072.0 | grad norm: 40415.963 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3282/ 292968 | consumed samples: 6721536 | consumed tokens: 1011630080 | elapsed time per iteration (ms): 111552.6 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.491541E+00 | loss scale: 131072.0 | grad norm: 57029.381 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3283/ 292968 | consumed samples: 6723584 | consumed tokens: 1012121600 | elapsed time per iteration (ms): 112426.3 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.497360E+00 | loss scale: 131072.0 | grad norm: 59242.468 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3284/ 292968 | consumed samples: 6725632 | consumed tokens: 1012613120 | elapsed time per iteration (ms): 111149.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.476556E+00 | loss scale: 131072.0 | grad norm: 45191.526 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3285/ 292968 | consumed samples: 6727680 | consumed tokens: 1013104640 | elapsed time per iteration (ms): 113840.4 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.492275E+00 | loss scale: 131072.0 | grad norm: 36899.796 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3286/ 292968 | consumed samples: 6729728 | consumed tokens: 1013596160 | elapsed time per iteration (ms): 113981.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.471767E+00 | loss scale: 131072.0 | grad norm: 42014.104 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | 
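Each iteration record above carries enough to derive throughput: the consumed-tokens delta between consecutive iterations equals global batch size times curriculum seqlen, and dividing by the elapsed time gives tokens per second. Below is a minimal Python sketch, assuming only the line shape shown in this log; the regex, the parse helper, and the variable names are illustrative and not part of the training code.

import re

# Parse one Megatron-style iteration record as it appears in this log.
# The exact spacing in the regex is an assumption based on the lines above.
ITER_RE = re.compile(
    r"iteration\s+(?P<iteration>\d+)/\s*\d+\s*\|"
    r"\s*consumed samples:\s*(?P<samples>\d+)\s*\|"
    r"\s*consumed tokens:\s*(?P<tokens>\d+)\s*\|"
    r"\s*elapsed time per iteration \(ms\):\s*(?P<elapsed_ms>[\d.]+)\s*\|"
    r".*?lm loss:\s*(?P<loss>[\d.E+-]+)\s*\|"
    r".*?curriculum seqlen:\s*(?P<seqlen>\d+)"
)

def parse(line):
    m = ITER_RE.search(line)
    return None if m is None else {
        k: (float(v) if k in ("elapsed_ms", "loss") else int(v))
        for k, v in m.groupdict().items()
    }

# Record copied verbatim from iteration 3285 above.
sample = (" iteration 3285/ 292968 | consumed samples: 6727680 | consumed tokens: "
          "1013104640 | elapsed time per iteration (ms): 113840.4 | learning rate: "
          "1.000E-04 | global batch size: 2048 | lm loss: 3.492275E+00 | loss scale: "
          "131072.0 | grad norm: 36899.796 | num zeros: 0.0 | curriculum seqlen: 240")
rec = parse(sample)

prev_tokens = 1012613120                        # consumed tokens at iteration 3284 above
delta = rec["tokens"] - prev_tokens             # 491520 = 2048 samples x 240 seqlen
print(delta / (rec["elapsed_ms"] / 1000.0))     # ~4318 tokens/s

On iterations 3284 to 3285 this works out to 491520 tokens in about 113.8 s, roughly 4.3k tokens per second at curriculum seqlen 240.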
-time (ms) - iteration 3287/ 292968 | consumed samples: 6731776 | consumed tokens: 1014087680 | elapsed time per iteration (ms): 113840.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.475223E+00 | loss scale: 131072.0 | grad norm: 45709.099 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3288/ 292968 | consumed samples: 6733824 | consumed tokens: 1014579200 | elapsed time per iteration (ms): 112154.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.503000E+00 | loss scale: 131072.0 | grad norm: 46516.672 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3289/ 292968 | consumed samples: 6735872 | consumed tokens: 1015070720 | elapsed time per iteration (ms): 110548.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.484241E+00 | loss scale: 131072.0 | grad norm: 37206.769 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3290/ 292968 | consumed samples: 6737920 | consumed tokens: 1015562240 | elapsed time per iteration (ms): 112012.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.478825E+00 | loss scale: 131072.0 | grad norm: 39774.517 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3291/ 292968 | consumed samples: 6739968 | consumed tokens: 1016053760 | elapsed time per iteration (ms): 110410.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.495184E+00 | loss scale: 131072.0 | grad norm: 38254.934 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3292/ 292968 | consumed samples: 6742016 | consumed tokens: 1016545280 | elapsed time per iteration (ms): 111588.5 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.488030E+00 | loss scale: 131072.0 | grad norm: 43122.399 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3293/ 292968 | consumed samples: 6744064 | consumed tokens: 1017036800 | elapsed time per iteration (ms): 110742.2 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.512937E+00 | loss scale: 131072.0 | grad norm: 42031.635 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3294/ 292968 | consumed samples: 6746112 | consumed tokens: 1017528320 | elapsed time per iteration (ms): 112447.7 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.472189E+00 | loss scale: 131072.0 | grad norm: 44968.571 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3295/ 292968 | consumed samples: 6748160 | consumed tokens: 1018019840 | elapsed time per iteration (ms): 111572.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.486163E+00 | loss scale: 131072.0 | grad norm: 46456.832 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3296/ 292968 | consumed samples: 6750208 | consumed tokens: 1018511360 | elapsed time per iteration (ms): 112407.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.476424E+00 | loss scale: 
131072.0 | grad norm: 36053.245 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3297/ 292968 | consumed samples: 6752256 | consumed tokens: 1019002880 | elapsed time per iteration (ms): 111913.8 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.471766E+00 | loss scale: 131072.0 | grad norm: 44322.924 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 3298/ 292968 | consumed samples: 6754304 | consumed tokens: 1019494400 | elapsed time per iteration (ms): 112625.1 | learning rate: 1.000E-04 | global batch size: 2048 | lm loss: 3.473320E+00 | loss scale: 131072.0 | grad norm: 50050.388 | num zeros: 0.0 | curriculum seqlen: 240 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms)
+version https://git-lfs.github.com/spec/v1
+oid sha256:785284eb9d16e7560e950dd6d46cc64c1ac82fd6b41b98ed4b62745e7d8c07fe
+size 12208050
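The startup dump above prints the model size both with and without embeddings; the gap between the two totals is the embedding parameter count. A quick worked check in Python, using only numbers printed in this log (variable names are illustrative). A few ranks print marginally different totals (103.368064 vs 103.3650944 billion), presumably because the per-rank counting differs slightly across pipeline stages.

# Totals as printed at startup, in units of 1e9 ("billion"):
total_params = 125.2213504e9       # "Number of parameters"
no_embeddings = 103.3650944e9      # "Number of parameters without embeddings"

# The difference is the parameter count of the embedding matrices
# (plus whatever else the "without embeddings" total excludes).
embedding_params = total_params - no_embeddings
print(embedding_params / 1e9)      # 21.856256 billion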